def test_incremental_encode(self):
    self.assertEquals(
        "".join(codecs.iterencode(u"python.org", "idna")),
        "python.org"
    )
    self.assertEquals(
        "".join(codecs.iterencode(u"python.org.", "idna")),
        "python.org."
    )
    self.assertEquals(
        "".join(codecs.iterencode(u"pyth\xf6n.org", "idna")),
        "xn--pythn-mua.org"
    )
    self.assertEquals(
        "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
        "xn--pythn-mua.org."
    )

    encoder = codecs.getincrementalencoder("idna")()
    self.assertEquals(encoder.encode(u"\xe4x"), "")
    self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
    self.assertEquals(encoder.encode(u"", True), "org")

    encoder.reset()
    self.assertEquals(encoder.encode(u"\xe4x"), "")
    self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
    self.assertEquals(encoder.encode(u"", True), "")
def push_last(self, arg: str, extra_margin: int = 0, human_trunc: bool = False) -> typing.Optional[str]:
    last_arg = self.args[-1]
    tags, line = self._format()

    n = len(line.encode("utf8"))  # get length of current line
    n += self._margin             # margin used for :hostmask
    if " " in arg and not " " in last_arg:
        n += 1                    # +1 for colon on new arg
    n += extra_margin             # used for things like (more ...)

    overflow: typing.Optional[str] = None
    if (n + len(arg.encode("utf8"))) > LINE_MAX:
        for i, char in enumerate(codecs.iterencode(arg, "utf8")):
            n += len(char)
            if n > LINE_MAX:
                arg, overflow = arg[:i], arg[i:]
                if human_trunc and not overflow[0] == " ":
                    new_arg, sep, new_overflow = arg.rpartition(" ")
                    if sep:
                        arg = new_arg
                        overflow = new_overflow + overflow
                break

    if arg:
        self.args[-1] = last_arg + arg
    return overflow
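# A standalone sketch (not part of push_last above) of the same byte-budget
# idea: walk codecs.iterencode() one character at a time and split the text
# where the encoded length would exceed the limit. The function name and the
# limit used here are made up for illustration.
import codecs

def truncate_to_bytes(text, limit):
    # Accumulate per-character encoded lengths until the budget is exceeded.
    n = 0
    for i, chunk in enumerate(codecs.iterencode(text, "utf8")):
        n += len(chunk)
        if n > limit:
            return text[:i], text[i:]
    return text, ""

# '\xe9' encodes to two bytes, so only "h\xe9" (3 bytes) fits in the budget.
assert truncate_to_bytes(u"h\xe9llo", 3) == (u"h\xe9", u"llo")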
def handler(env, start_response):
    inp = env['wsgi.input']
    reader = codecs.getreader("utf-8")
    state = json.load(reader(inp))
    ret = {}
    for key, vtype in STATE_KEYS.items():
        value = state[key]
        assert isinstance(value, vtype)
        ret[key] = value
    # validate the input, for sure.
    n = state["n"]
    if n < 2 or n > 201:
        ret["n"] = 101
    values = valuesiter(ret)
    ret["values"] = values
    start_response('200 OK', [('Content-Type', 'application/json')])
    je = json.JSONEncoder()
    viter = je.iterencode(ret)
    return codecs.iterencode(viter, "utf-8")
def handle_charref(self, name):
    if name.startswith('x') or name.startswith('X'):
        cp = int(name[1:], 16)
    else:
        cp = int(name)
    u = unichr(cp)
    self.buf += codecs.iterencode(u, 'utf-8')
def __init__(self, unicode_csvfile, *args, **kwargs):
    decoder = codecs.getdecoder('utf-8')
    self.decoder = lambda v: decoder(v)[0]
    utf8_csvfile = codecs.iterencode(unicode_csvfile, encoding='utf-8')
    # bollocks to csv.DictReader being an oldstyle class
    csv.DictReader.__init__(self, utf8_csvfile, *args, **kwargs)
    self.fieldnames = [self.decoder(f) for f in self.fieldnames]
def run_cgi_script(self, filesystem_path: pathlib.Path, environ: dict) -> Response:
    """
    Execute the given file as a CGI script and return the script's stdout
    stream to the client.
    """
    script_name = str(filesystem_path)
    cgi_env = environ.copy()
    cgi_env["GATEWAY_INTERFACE"] = "CGI/1.1"
    cgi_env["SCRIPT_NAME"] = script_name

    # Decode the stream as unicode so we can parse the status line
    # Use surrogateescape to preserve any non-UTF8 byte sequences.
    out = subprocess.Popen(
        [script_name],
        stdout=subprocess.PIPE,
        env=cgi_env,
        bufsize=1,
        universal_newlines=True,
        errors="surrogateescape",
    )
    status_line = out.stdout.readline().strip()
    status_parts = status_line.split(maxsplit=1)
    if len(status_parts) != 2 or not status_parts[0].isdecimal():
        return Response(Status.CGI_ERROR, "Unexpected Error")

    status, meta = status_parts
    # Re-encode the rest of the body as bytes
    body = codecs.iterencode(out.stdout, encoding="utf-8", errors="surrogateescape")
    return Response(int(status), meta, body)
def csvreader(f):
    try:
        for row in csv.reader(f):
            yield row
    except UnicodeEncodeError:
        for row in csv.reader(codecs.iterencode(f, 'utf-8')):
            yield [e.decode('utf-8') for e in row]
def serialize(self, out):
    """Write the units back to file."""
    # Thanks to iterencode, a possible BOM is written only once
    for chunk in iterencode(
        (unit.getoutput() for unit in self.units), self.encoding
    ):
        out.write(chunk)
def iter_byte_indices(text, codec):
    '''
    Iterate over the codepoint offset of each byte (any codec).
    '''
    for i, b in enumerate(codecs.iterencode(text, codec)):
        for _ in b:
            yield i
    yield len(text)
def _utf8_iter_recoder(stream, encoding):
    """Generator re-encodes input file's lines from a given encoding to utf-8.

    :param stream: file handle.
    :param encoding: str of encoding.
    """
    return codecs.iterencode(codecs.iterdecode(stream, encoding), "utf-8")
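# A minimal, self-contained illustration of the recoding pattern used by
# _utf8_iter_recoder above (the stream object and sample text are made up):
# decode latin-1 lines and re-encode them as UTF-8 bytes.
import codecs
import io

latin1_stream = io.BytesIO(u"caf\xe9\nna\xefve\n".encode("latin-1"))
utf8_chunks = codecs.iterencode(codecs.iterdecode(latin1_stream, "latin-1"), "utf-8")
assert b"".join(utf8_chunks) == u"caf\xe9\nna\xefve\n".encode("utf-8")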
def test_incremental_encode():
    from codecs import iterencode
    encoded = iterencode(
        (c for c in UNICODE.utf8),
        'internet'
    )
    encoded = ''.join(encoded)
    assert encoded == UNICODE.utf8.encode('UTF-8')
def __init__(self, f, fromenc=ENCODING, toenc=ENCODING, **kwargs):
    """ Reencoder constructor

    Args:
        f (obj): File-like object
        fromenc (str): The input encoding.
        toenc (str): The output encoding.

    Kwargs:
        remove_BOM (bool): Remove Byte Order Marker (default: True)
        decode (bool): Decode the text into a string (default: False)

    Examples:
        >>> encoding = 'utf-16-be'
        >>> eff = p.join(DATA_DIR, 'utf16_big.csv')
        >>>
        >>> with open(eff, 'rb') as f:
        ...     reenc = Reencoder(f, encoding)
        ...     first = reenc.readline(keepends=False)
        ...     first.decode('utf-8') == '\ufeffa,b,c'
        ...     reenc.read().decode('utf-8').split('\\n')[1] == '4,5,ʤ'
        True
        True
        >>> with open(eff, 'rb') as f:
        ...     reenc = Reencoder(f, encoding, decode=True)
        ...     reenc.readline(keepends=False) == '\ufeffa,b,c'
        True
        >>> with open(eff, 'rU', encoding=encoding) as f:
        ...     reenc = Reencoder(f, remove_BOM=True)
        ...     reenc.readline(keepends=False) == b'a,b,c'
        ...     reenc.readline() == b'1,2,3\\n'
        ...     reenc.readline().decode('utf-8') == '4,5,ʤ'
        True
        True
        True
    """
    self.fileno = f.fileno
    first_line = next(f)
    bytes_mode = isinstance(first_line, BYTE_TYPE)
    decode = kwargs.get('decode')
    rencode = not decode

    if kwargs.get('remove_BOM'):
        strip = BOM.encode(fromenc) if bytes_mode else BOM
        first_line = first_line.lstrip(strip)

    chained = it.chain([first_line], f)

    if bytes_mode:
        decoded = iterdecode(chained, fromenc)
        self.binary = rencode
    else:
        decoded = chained
        self.binary = bytes_mode or rencode

    self.stream = iterencode(decoded, toenc) if rencode else decoded
def read_decoded_data(self, f):
    # This ensures that the raw result bytes we got are, in fact, valid utf-8,
    # replacing invalid bytes with �. Because python2's unicode support is
    # wonky, we re-encode the now-valid-utf-8 back into a str object so that
    # users don't need to deal with `unicode` objects.
    # The file contents can be large, so be careful to do the conversion in
    # chunks while streaming the data in, instead of requiring a full copy.
    n = 1 << 16
    chunks = iter(lambda: f.read(n), '')
    decoded = codecs.iterdecode(chunks, 'utf-8', 'replace')
    return ''.join(codecs.iterencode(decoded, 'utf-8'))
def __iter__(self):
    root = ElementTree.fromstringlist(codecs.iterencode(self.inputreader, 'utf'))
    table = root.findall('.//table')[5]
    for row in table[2:-1]:
        date_str = row.find('.//td[3].nobr').text
        tdate = datetime.strptime(date_str, '%d.%m.%Y')
        amount_str = row.find('.//td[5].nobr').text
        amount_str = amount_str.replace('.', '')
        tamount = float(normalize_num(amount_str))
        desc = plain_content(row.find('.//td[4]'))
        tmessage = normalize_field(desc)
        yield TransactionData(tdate, tamount, message=tmessage)
def iter_codepoint_indices(text, codec):
    '''
    Iterate over the byte offset of each character (any codec).
    '''
    # Note: for encodings with a BOM, the first offset probably shouldn't
    # be 0, but 2, 3, or 4, depending on the BOM's length.
    # This is ignored due to the lack of expected practical applications.
    i = 0
    for b in codecs.iterencode(text, codec):
        yield i
        i += len(b)
    yield i
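# A quick standalone check of the offset arithmetic used by
# iter_codepoint_indices above (the sample text is made up): accumulating
# len(chunk) over codecs.iterencode() yields each character's byte offset.
import codecs

text = u"a\xe9z"            # '\xe9' occupies two bytes in UTF-8
offsets, i = [], 0
for chunk in codecs.iterencode(text, "utf-8"):
    offsets.append(i)
    i += len(chunk)
offsets.append(i)
assert offsets == [0, 1, 3, 4]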
def stream(input, encoding=None, errors='strict'):
    """Safely iterate a template generator, ignoring ``None`` values and
    optionally stream encoding.

    Used internally by ``cinje.flatten``, this allows for easy use of a
    template generator as a WSGI body.
    """
    input = (i for i in input if i)  # Omits `None` (empty wrappers) and empty chunks.

    if encoding:  # Automatically, and iteratively, encode the text if requested.
        input = iterencode(input, encoding, errors=errors)

    return input
def write_encoded_data(self, f):
    # Sometimes users give us invalid utf-8 data. They shouldn't, but it does
    # happen every once in a while. Just ignore it, and replace with �.
    # We're assuming users only want to write text data out.
    # self.data can be large, so be careful to do the conversion in chunks
    # while streaming the data out, instead of requiring a full copy.
    n = 1 << 16
    # This is a generator expression, so this only copies one chunk of
    # self.data at any one time.
    chunks = (self.data[i:i + n] for i in xrange(0, len(self.data), n))
    decoded = codecs.iterdecode(chunks, 'utf-8', 'replace')
    for chunk in codecs.iterencode(decoded, 'utf-8'):
        f.write(chunk)
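# A standalone illustration of the chunked clean-up used in write_encoded_data
# above (the chunks here are made up): invalid UTF-8 bytes come out as U+FFFD
# after the iterdecode/iterencode round trip.
import codecs

raw_chunks = [b"ok \xff", b" more"]
decoded = codecs.iterdecode(raw_chunks, "utf-8", "replace")
cleaned = b"".join(codecs.iterencode(decoded, "utf-8"))
assert cleaned == u"ok \ufffd more".encode("utf-8")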
def csv_unireader(f, encoding="utf-8"):
    if PY3:
        f = codecs.open(f, encoding=encoding)
        r = csv.reader(f, delimiter='|', quotechar='"')
    else:
        r = csv.reader(
            codecs.iterencode(codecs.iterdecode(open(f), encoding), 'utf-8'),
            delimiter=b'|', quotechar=b'"')
    for row in r:
        if PY3:
            yield row
        else:
            yield [e.decode("utf-8") for e in row]
def error(self, code: int, message: str, environ: Environ,
          start_response: StartResponseCallable) -> ResponseStream:
    statuses = {
        400: 'Bad Request',
        405: 'Method Not Allowed',
    }
    start_response(
        f'{code} {statuses[code]}',
        [('Content-Type', 'text/plain')]
    )
    return codecs.iterencode(message, 'utf-8')
def encode_all(f=None, **kwargs):
    """ Encode unicode into bytes (str) """
    names = kwargs.pop('fieldnames', None)
    encoding = kwargs.pop('encoding', None) if f else False
    decoded = codecs.iterdecode(f, encoding) if encoding else f
    ekwargs = {encode(k): encode(v) for k, v in kwargs.items()}
    fmtparams = {k: v for k, v in ekwargs.items() if k in dir(csv.Dialect)}

    res = {
        'f': codecs.iterencode(decoded, ENCODING) if f else None,
        'fieldnames': [encode(x) for x in names] if names else None,
        'kwargs': ekwargs,
        'fmtparams': fmtparams}

    return res
def write_text_resource(foutput, text, encoding='utf-8'):
    """Write a text resource

    :param foutput: path or file handle
    :type foutput: str, file
    :param text: content to write
    :type text: str, unicode, iterable
    :param encoding: which encoding to use (default: UTF-8)
    :type encoding: str
    """
    if isinstance(foutput, file):
        for chunk in codecs.iterencode(text, encoding=encoding):
            foutput.write(chunk)
    else:
        with codecs.open(foutput, 'w', encoding=encoding) as fhandle:
            if isiterable(text):
                for line in text:
                    fhandle.write(u"%s\n" % line)
            else:
                fhandle.write(text)
def __iter_extended_rows(self):

    # For PY2 encode/decode
    if six.PY2:
        # Reader requires utf-8 encoded stream
        bytes = iterencode(self.__chars, 'utf-8')
        items = csv.reader(bytes, **self.__options)
        for number, item in enumerate(items, start=1):
            values = []
            for value in item:
                value = value.decode('utf-8')
                values.append(value)
            yield (number, None, list(values))

    # For PY3 use chars
    else:
        items = csv.reader(self.__chars, **self.__options)
        for number, item in enumerate(items, start=1):
            yield (number, None, list(item))
def __emit_items(self):

    # For PY2 encode/decode
    if six.PY2:
        # Reader requires utf-8 encoded stream
        bytes = iterencode(self.__chars, 'utf-8')
        items = csv.reader(bytes, **self.__options)
        for item in items:
            values = []
            for value in item:
                value = value.decode('utf-8')
                values.append(value)
            yield (None, tuple(values))

    # For PY3 use chars
    else:
        items = csv.reader(self.__chars, **self.__options)
        for item in items:
            yield (None, tuple(item))
def read_docs(filename, column):
    ret_arr = []
    #print_err("reading %s" % (filename))
    dupes = 0
    with open(filename, 'r') as infile:
        coder = codecs.iterencode(codecs.iterdecode(infile, "utf-8"), "utf-8")
        csvfile = csv.reader(coder, delimiter=',', quotechar='"')
        #next(csvfile)
        dup_checker = {}
        for row in csvfile:
            test = row[column].lower()
            if test not in dup_checker:
                dup_checker[test] = 1
                ret_arr.append(row[column])
            else:
                dupes += 1
    print_err("total dupes: %d" % dupes)
    return ret_arr
def testParseResponse(self):
    # Parse a simple response to a subscribe:
    self.conn.write_socket(self.subscribe_cmd)
    parsed_resp = self.conn.parse_response()
    self.assertEqual([u'subscribe', u'tmp.20', 1L], parsed_resp)

    # Parse a more complicated incoming msg:
    #    '*3\r\n' +\
    #    '$7\r\n' +\
    #    'PUBLISH\r\n' +\
    #    '$4\r\n' +\
    #    'test\r\n' +\
    #    '$11\r\n{"Hello World"}'

    # Make the OnDemandPublisher thread send us this
    # message:
    OneShotTester.answer_server.sendMessage(OneShotTester.test_msg,
                                            OneShotTester.from_channel)
    parsed_resp = self.conn.parse_response()

    # Get the message that was sent back to us:
    sent_bus_msg = OneShotTester.answer_server.outMsg
    out_id = sent_bus_msg.id

    rx_encoded = []
    for el in codecs.iterencode(parsed_resp, 'UTF-8'):
        rx_encoded.append(el)

    expected = [
        'message',
        OneShotTester.from_channel,
        '{"content": "Hello world", "id": "%s", ' % out_id
    ]

    # Time of sent msg will be different each time; cut it out of the
    # rx-ed string:
    rxed_msg_body = rx_encoded[2]
    rxed_body_chop_pos = rxed_msg_body.index('"time": ')
    chopped_rxed_body = rxed_msg_body[:rxed_body_chop_pos]

    # Replaced msg body part of received by the
    # truncated version that doesn't include the time:
    rx_encoded[2] = chopped_rxed_body

    self.assertEqual(expected, rx_encoded)
def __iter_extended_rows(self):

    # For PY2 encode/decode
    if six.PY2:
        # Reader requires utf-8 encoded stream
        bytes = iterencode(self.__chars, 'utf-8')
        sample, dialect = self.__prepare_dialect(bytes)
        items = csv.reader(chain(sample, bytes), dialect=dialect)
        for number, item in enumerate(items, start=1):
            values = []
            for value in item:
                value = value.decode('utf-8')
                values.append(value)
            yield (number, None, list(values))

    # For PY3 use chars
    else:
        sample, dialect = self.__prepare_dialect(self.__chars)
        items = csv.reader(chain(sample, self.__chars), dialect=dialect)
        for number, item in enumerate(items, start=1):
            yield (number, None, list(item))
def encode_all(f=None, **kwargs):
    """ Encode unicode into bytes (str) """
    names = kwargs.pop('fieldnames', None)
    encoding = kwargs.pop('encoding', None) if f else False

    if PY2:
        decoded = codecs.iterdecode(f, encoding) if encoding else f
        ekwargs = {encode(k): encode(v) for k, v in kwargs.items()}
    else:
        decoded, ekwargs = f, kwargs

    res = {
        'f': codecs.iterencode(decoded, ENCODING) if f and PY2 else decoded,
        'fieldnames': [encode(x) for x in names] if names and PY2 else names,
        'drkwargs': use_keys_from(ekwargs, READER_KEYS),
        'dwkwargs': use_keys_from(ekwargs, WRITER_KEYS),
        'fmtparams': use_keys_from(ekwargs, FMTKEYS)}

    return res
def __iter_extended_rows(self):

    # For PY2 encode/decode
    if six.PY2:
        # Reader requires utf-8 encoded stream
        bytes = iterencode(self.__chars, 'utf-8')
        sample, dialect = self.__prepare_dialect(bytes)
        items = csv.reader(chain(sample, bytes), dialect=dialect)
        for row_number, item in enumerate(items, start=1):
            values = []
            for value in item:
                value = value.decode('utf-8')
                values.append(value)
            yield (row_number, None, list(values))

    # For PY3 use chars
    else:
        sample, dialect = self.__prepare_dialect(self.__chars)
        items = csv.reader(chain(sample, self.__chars), dialect=dialect)
        for row_number, item in enumerate(items, start=1):
            yield (row_number, None, list(item))
def testIncrementalEncoder(self):
    # Tests derived from Python standard library test/test_codecs.py
    incremental_tests = (
        (u"python.org", b"python.org"),
        (u"python.org.", b"python.org."),
        (u"pyth\xf6n.org", b"xn--pythn-mua.org"),
        (u"pyth\xf6n.org.", b"xn--pythn-mua.org."),
    )

    for decoded, encoded in incremental_tests:
        self.assertEqual(b"".join(codecs.iterencode(decoded, "idna")),
                         encoded)

    encoder = codecs.getincrementalencoder("idna")()
    self.assertEqual(encoder.encode(u"\xe4x"), b"")
    self.assertEqual(encoder.encode(u"ample.org"), b"xn--xample-9ta.")
    self.assertEqual(encoder.encode(u"", True), b"org")

    encoder.reset()
    self.assertEqual(encoder.encode(u"\xe4x"), b"")
    self.assertEqual(encoder.encode(u"ample.org."), b"xn--xample-9ta.org.")
    self.assertEqual(encoder.encode(u"", True), b"")
def testIncrementalEncoder(self):
    # Tests derived from Python standard library test/test_codecs.py
    incremental_tests = (
        ("python.org", b"python.org"),
        ("python.org.", b"python.org."),
        ("pyth\xf6n.org", b"xn--pythn-mua.org"),
        ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
    )

    for decoded, encoded in incremental_tests:
        self.assertEqual(b"".join(codecs.iterencode(decoded, "idna")),
                         encoded)

    encoder = codecs.getincrementalencoder("idna")()
    self.assertEqual(encoder.encode("\xe4x"), b"")
    self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
    self.assertEqual(encoder.encode("", True), b"org")

    encoder.reset()
    self.assertEqual(encoder.encode("\xe4x"), b"")
    self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
    self.assertEqual(encoder.encode("", True), b"")
def csv_unireader(f, encoding="utf-8"):
    """ Open a utf-8 file with the csv lib """
    for row in csv.reader(codecs.iterencode(codecs.iterdecode(f, encoding), "utf-8"),
                          delimiter="\t"):
        yield [e.decode("utf-8") for e in row]
file_list = sorted(glob.glob(args.inputs))
file_count = len(file_list)

url_dict = {}

#
# step through each file
#
for filename_index in range(file_count):
    filename = file_list[filename_index]
    print "opening \"%s\" (%d of %d)" % (
        filename, filename_index+1, file_count)

    with io.open(filename, "r", encoding="utf-8") as f:
        input_csv = csv.DictReader(codecs.iterencode(f, "utf-8"))
        for row in input_csv:
            url = row['url']
            if url not in url_dict:
                url_dict[url] = row

with io.open(args.output, "w", encoding="utf-8") as outfile:
    outfile.write(u"id,url,text\n")
    for row in url_dict.values():
        tweet_id = row["id"]
        url = row["url"]
        text = row["text"]
        if text is not None:
            text = text.replace("\"", "\"\"")
def on_get(self, request, resp, **kwargs):
    """ Falcon resource method, for handling HTTP request GET method

        Falcon request provides: parameters embedded in URL via a keyword
        args dict, as well as convenience class variables falcon.HTTP_*

        FIXME: remove module pylint:disable= & refactor this overlong code block!
    """
    # obtain logged in API user ID (if available)
    api_session_user = auth.get_user_id(request)
    # select data
    with warehouse.get_source_model_session() as dwsupport_model:
        sources = source.SourceUtil.get_list_of_data_sources(
            request.url
            ,auth.get_user_id(request)
            ,dwsupport_model)
        str_dataset_id = get_requested_dataset_id(sources, request, resp, kwargs)
        list_variables_requested_source = variables.get_list_of_variables(str_dataset_id)
        # convert 'datasets' into a list of variables
        list_requested_datasets = parameters.get_requested_datasets(request)
        list_variables_from_datasets = []
        for str_id in list_requested_datasets:
            if str_dataset_id == str_id:
                list_variables_from_datasets = list_variables_requested_source
                break
            if str_dataset_id == 'warehouse':
                #FIXME: refactor this into a source.warehouse function
                #obtain the 'warehouse' field aliases for each dataset
                list_source_variables = variables.get_list_of_variables(str_id)
                for var in list_source_variables:
                    warehouse_utils = api.resources.source.warehouse.warehouse
                    str_alias = warehouse_utils.prefix_field_name(var, str_id)
                    list_variables_from_datasets.append(str_alias)
            else:
                #error; not a warehouse request & Dataset does not match requested ID
                raise falcon.HTTPNotFound(description="Unrecognized dataset: " + str_id)
        list_requested_variables = parameters.get_requested_variables(request)
        # add default variables
        if len(list_requested_variables) < 1:
            requested_default_query = parameters.get_list_requested_parameter(
                defaults.PARAMETER_NAME, request)
            try:
                default_variables = defaults.get_default_variables(
                    requested_default_query
                    ,str_dataset_id
                    ,dwsupport_model)
            except defaults.UndefinedDefaultQuery as error:
                msg = ("Value {} is not defined for dataset: '{}'"
                       .format(error, str_dataset_id))
                raise falcon.HTTPInvalidParam(msg, defaults.PARAMETER_NAME)
            except defaults.AmbiguousDefaultQuery as error:
                msg = "More than one value was specified: {}".format(error)
                raise falcon.HTTPInvalidParam(msg, defaults.PARAMETER_NAME)
            except defaults.AmbiguousQueryHierarchy as error:
                raise falcon.HTTPBadRequest( #TODO: add functional test coverage
                    title="Missing Parameter"
                    ,description=(
                        "Selection defaults not clear for"
                        " data source: '{}'."
                        " Selection must specify one or more 'variables='"
                        " selection parameters (or a 'defaults=' parameter"
                        " value from the following list: {})"
                    ).format(str_dataset_id, error)
                )
            list_requested_variables.extend(default_variables)
        # add variables derived from 'datasets' param
        list_requested_variables.extend(list_variables_from_datasets)

        list_requested_filters = parameters.get_requested_filters(request)
        # process pivot columns parameter
        try:
            pivot_column_variables = parameters.get_requested_pivot_columns(
                request
                ,str_dataset_id
                ,dwsupport_model['tables'])
        except parameters.PivotVariableError as err:
            raise falcon.HTTPInvalidParam(
                msg=str(err)
                ,param_name=parameters.ReservedParameterNames.pivot_columns
            ) from err
        # process 'Empty_cells' parameter
        try:
            empty_cell_dimensions = parameters.get_requested_empty_cells(
                request
                ,str_dataset_id
                ,dwsupport_model['tables']
                ,dwsupport_model['associations']
            )
        except (parameters.EmptyCellsSourceError
                ,parameters.EmptyCellsDimensionError) as err:
            raise falcon.HTTPInvalidParam(
                msg=str(err)
                ,param_name=parameters.ReservedParameterNames.empty_cells
            ) from err
        # retrieve data
        start_time = datetime.now(pytz.timezone('US/Pacific'))
        try:
            result_generator = data.get_data(str_dataset_id
                                             ,list_requested_variables
                                             ,list_requested_filters
                                             ,pivot_column_variables
                                             ,empty_cell_dimensions
                                             ,user_id=api_session_user)
        except sqlalchemy.exc.DatabaseError as err:
            raise falcon.HTTPInternalServerError(
                title='500'
                ,description="Please try again"
            ) from err
        except data.NoSourceException as err:
            raise falcon.HTTPNotFound(description=("Source '{}' dataset not found:"
                                                   " {}").format(str_dataset_id, err)) from err
        except parameters.FilterVariableError as err:
            #TODO: the bad HTTP parameter not always 'filters', sometimes a user-defined param (implicit-filter)
            #TODO: perhaps parameters should raise two different Exceptions?
            raise falcon.HTTPInvalidParam(str(err), 'filters') from err
        except data.NotAuthorizedException as error:
            raise falcon.HTTPUnauthorized(
                title='401'
                ,description=("Selection from sensitive data source '{}'"
                              " not authorized").format(str_dataset_id)
            ) from error
        str_format_type = get_requested_format_type(kwargs)
        resp.content_type = FormatUtil.get_http_content_type(str_format_type)
        for data_source in sources:
            if data_source['id'] == str_dataset_id:
                formatter = FormatUtil(str_format_type, data_source, request, start_time)
                result_stream = formatter.format(result_generator)
                break
        chunked_stream = streaming.biggerchunks_stream(result_stream, 4)  #2(13.6),3(13),4(
        if str_format_type == 'xlsx':
            byte_stream = chunked_stream  # already bytes
        else:
            encoding = 'utf-8'
            if resp.content_type == 'text/csv':
                encoding = 'utf-8-sig'
            byte_stream = codecs.iterencode(chunked_stream, encoding)
        resp.stream = byte_stream  # content
re_strip_mention = re.compile(u'(via\s)?@\w+', flags=re.IGNORECASE)
re_strip_cc = re.compile(u'-CC', flags=re.IGNORECASE)
re_strip_url = re.compile(u'(?i)\b((?:https?:?//|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', flags=re.IGNORECASE)
re_strip_tco_url = re.compile(u'https?:?//[\w\-\.]*(/[\w\-\%]*)*( via)?', flags=re.IGNORECASE)
re_strip_punct = re.compile(u'[\.,;\:\-\!\[\]\"\'\?\(\)\|@\/]', flags=re.IGNORECASE)
re_strip_newline = re.compile(u'[\r\n\t]+')
re_strip_ending_http = re.compile(u'(\s)ht?t?t?p?:?/?/?([\w\-\.]*)?\s?$', flags=re.IGNORECASE)
word_split_re = re.compile(u'\@?\w+|[\$?-]*[\d\.]+|[a-zA-Z0-9\-\'\/]+|[!\?\.\,\/\@\:\-\_&\$\s]*', flags=re.IGNORECASE)

for csvinput in args.inputfiles:
    with open(csvinput, 'r') as infile:
        coder = codecs.iterencode(codecs.iterdecode(infile, "utf-8"), "utf-8")
        csvreader = csv.reader(coder, delimiter=',', quotechar='"')
        # skip header
        next(csvreader)
        for row in csvreader:
            text = row[2]
            # strip retweet
            text = re_strip_rt.sub('', text)
            text = re_strip_mention.sub('', text)
            text = re_strip_cc.sub('', text)
            text = re_strip_url.sub('', text)
            text = re_strip_tco_url.sub('', text)
def handle(self, *args, **options):
    for file_name in args:
        csv_file = codecs.open(file_name, 'rb', 'cp1252')
        encoded_file = codecs.iterencode(csv_file, 'utf-8')
        self.consume_csv(encoded_file)
def csv_unireader(f, encoding="utf-8"):
    for row in csv.reader(codecs.iterencode(codecs.iterdecode(f, encoding), "utf-8")):
        yield [e.decode("utf-8") for e in row]
def _csv_unireader(f, encoding='utf-8'):
    return csv.DictReader(codecs.iterencode(codecs.iterdecode(f, encoding), 'utf-8'),
                          delimiter=',')
sniffer = csv.Sniffer()
for f in files:
    encoding = 'UTF-8'
    if f is not sys.stdin and quiet < 2:
        print ('Processing %s...' % f.name),
        sys.stdout.flush()
    data = f.read()
    prefix = data[:1024 ** 2]
    if f is not sys.stdin and chardet:
        encoding = chardet.detect(prefix)['encoding']
    dialect = sniffer.sniff(prefix, delimiters=',\t')
    f = StringIO(data)
    if f is not sys.stdin and quiet < 2:
        print '(as %s)' % encoding
    if encoding != 'UTF-8':
        f = codecs.iterencode(
            codecs.iterdecode(f, encoding), 'utf-8')
    if args.dry_run:
        # just the headers
        data = [f.next()]
    else:
        data = list(f)
    reader = csv.DictReader(data, dialect=dialect)
    if args.dry_run:
        d = GetSupportingDefaultDict()
        importer(d)
        fieldnames = set(reader.fieldnames)
        d = set(d)
        used = fieldnames & d
        print 'Used keys (%i):' % len(used)
        for k in sorted(used):
            print '*', k
#coding=utf-8
'''
Created on 2012-5-4

@author: zhaojp
'''
from codecs import iterencode
from codecs import iterdecode

echars = iterencode(['你', '好', '啊'], 'utf-8')
dchars = iterdecode(echars, 'utf-8')

for c in dchars:
    print(c)
def encode(input, output):
    # TODO tests
    for chunk in codecs.iterencode(chunks_text(input), 'utf-8'):
        output.write(chunk)
def csv_unireader(self, f, encoding="utf-8"):
    import codecs
    import csv
    for row in csv.reader(codecs.iterencode(codecs.iterdecode(f, encoding), "utf-8")):
        yield [e.decode("utf-8") for e in row]
def test_basics(self):
    s = u"abc123"  # all codecs should be able to encode these
    for encoding in all_unicode_encodings:
        name = codecs.lookup(encoding).name
        if encoding.endswith("_codec"):
            name += "_codec"
        elif encoding == "latin_1":
            name = "latin_1"
        self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
        (bytes, size) = codecs.getencoder(encoding)(s)
        if encoding != "unicode_internal":
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
        (chars, size) = codecs.getdecoder(encoding)(bytes)
        self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

        if encoding not in broken_unicode_with_streams:
            # check stream reader/writer
            q = Queue()
            writer = codecs.getwriter(encoding)(q)
            encodedresult = ""
            for c in s:
                writer.write(c)
                encodedresult += q.read()
            q = Queue()
            reader = codecs.getreader(encoding)(q)
            decodedresult = u""
            for c in encodedresult:
                q.write(c)
                decodedresult += reader.read()
            self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            # check incremental decoder/encoder (fetched via the Python
            # and C API) and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
                cencoder = _testcapi.codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check incremental decoder/encoder
                encodedresult = ""
                for c in s:
                    encodedresult += encoder.encode(c)
                encodedresult += encoder.encode(u"", True)
                decoder = codecs.getincrementaldecoder(encoding)()
                decodedresult = u""
                for c in encodedresult:
                    decodedresult += decoder.decode(c)
                decodedresult += decoder.decode("", True)
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                # check C API
                encodedresult = ""
                for c in s:
                    encodedresult += cencoder.encode(c)
                encodedresult += cencoder.encode(u"", True)
                cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                decodedresult = u""
                for c in encodedresult:
                    decodedresult += cdecoder.decode(c)
                decodedresult += cdecoder.decode("", True)
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                # check iterencode()/iterdecode()
                result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                # check iterencode()/iterdecode() with empty string
                result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                self.assertEqual(result, u"")
def csv_unireader(f, encoding="utf-8", **fmtparams):
    data = csv.reader(
        codecs.iterencode(codecs.iterdecode(f, encoding), "utf-8"),
        **fmtparams)
    for row in data:
        yield [e.decode("utf-8") for e in row]
def csv_unireader(f, encoding="utf-8"):
    for row in csv.reader(codecs.iterencode(codecs.iterdecode(f, encoding), "utf-8"),
                          delimiter=';', quotechar='"'):
        yield [e.decode("utf-8") for e in row]
def csv_rows(file_obj, dialect=csv.excel, **kwargs):
    csvfile = iterencode(file_obj, 'utf-8') if PY2 else file_obj
    csvreader = csv.reader(csvfile, dialect=dialect, **kwargs)
    csvreader = (list(iterdecode(i, 'utf-8')) for i in csvreader) if PY2 else csvreader
    for row in csvreader:
        yield row
def csv_unireader(f, encoding="utf-8", delimiter=',', quotechar='"'):
    for row in csv.reader(codecs.iterencode(codecs.iterdecode(f, encoding), "utf-8"),
                          delimiter=delimiter, quotechar=quotechar):
        yield [e.decode("utf-8") for e in row]
def csv_unireader(f, encoding="utf-8"):
    for row in csv.reader(
            codecs.iterencode(codecs.iterdecode(f, encoding), "utf-8")):
        yield [e.decode("utf-8") for e in row]
def writelines(self, lines):
    # function scope import, but this is a bug workaround for pandas.
    from codecs import iterencode
    encoded_lines = iterencode(lines, self.encoding)
    self.bytes_filelike.writelines(encoded_lines)