def get_tokens(self, text, unfiltered=False):
    """
    Return an iterable of (tokentype, value) pairs generated from `text`.

    If `unfiltered` is set to `True`, the filtering mechanism is bypassed
    even if filters are defined.

    Also preprocess the text, i.e. expand tabs and strip it if wanted and
    applies registered filters.
    """
    if isinstance(text, str):
        if self.stripall:
            text = text.strip()
        elif self.stripnl:
            text = text.strip('\n')

    # The original additionally guarded on ``sys.version_info[0] < 3`` to
    # re-encode str input for Python 2; that branch is unreachable on
    # Python 3 (which this file requires elsewhere), so it was removed.
    text = StringIO(text)

    def streamer():
        # get_tokens_unprocessed yields (index, tokentype, value) triples;
        # the public API exposes only (tokentype, value).
        for i, t, v in self.get_tokens_unprocessed(text):
            yield t, v

    stream = streamer()
    if not unfiltered:
        stream = apply_filters(stream, self.filters, self)
    return stream
def run(self):
    """
    Run the corpus through the translation pipeline and classify
    generation errors.

    Pipes ``self.corpus`` through ``apertium`` (mode ``self.mode``,
    directory ``self.directory``), extracts the transfer output via
    ``self.get_transfer``, counts distinct forms, feeds them to
    ``lt-proc`` for generation and sorts every failing line into three
    buckets stored on the instance:

    * ``self.multiform``   -- line contains ``/`` but no ``#``
    * ``self.multibidix``  -- ``#`` followed by a ``/`` after a digit
    * ``self.tagmismatch`` -- ``#`` after a digit, with no ``/``

    The elapsed wall-clock time is recorded in ``self.timer``.
    """
    timing_begin = time.time()

    # Translate the corpus.  Use a context manager so the corpus file
    # handle is closed (the original leaked the handle from open()).
    with open(self.corpus) as corpus_file:
        app = Popen(['apertium', '-d', self.directory, self.mode],
                    stdin=corpus_file, stdout=PIPE, close_fds=True)
        raw = app.communicate()[0].decode('utf-8')
    transfer = self.get_transfer(raw)
    del raw  # release the full translation before building more buffers

    # "<count> <word>" lines, most frequent first.
    stripped = StringIO()
    for word, count in Counter(transfer).most_common():
        stripped.write("{:>6} {:<}\n".format(count, word))
    stripped = stripped.getvalue()

    # Generate surface forms for every analysed word.
    app = Popen(['lt-proc', '-d',
                 "%s.autogen.bin" % pjoin(self.directory, self.lang)],
                stdin=PIPE, stdout=PIPE, close_fds=True)
    surface = app.communicate(stripped.encode('utf-8'))[0].decode('utf-8')

    # Same input without the leading frequency column, so the generator
    # output can be shown side by side with its source analysis.
    nofreq = re.sub(r'[\s\t]*\d*\s*\^', '^', stripped)

    gen_errors = StringIO()
    for pair in itertools.zip_longest(surface.split('\n'),
                                      nofreq.split('\n'), fillvalue=""):
        gen_errors.write("{:<16}{:<16}\n".format(*[str(x) for x in pair]))
    gen_errors = gen_errors.getvalue().split('\n')

    multiform = []
    multibidix = []
    tagmismatch = []
    for line in gen_errors:
        if "#" in line:
            if re.search(r'[0-9] #.*\/', line):
                multibidix.append(line)
            elif re.search(r'[0-9] #', line) and '/' not in line:
                tagmismatch.append(line)
        elif "/" in line:
            multiform.append(line)

    self.multiform = multiform
    self.multibidix = multibidix
    self.tagmismatch = tagmismatch
    self.timer = time.time() - timing_begin
# Escape-sequence decoding round-trips: `s` was produced by a
# bytes(...).decode("unicode-escape") call just before this chunk.
assert s == "Hello,\nworld!"
s = bytes("Hello,\\tworld!", "utf-8").decode("unicode-escape")
assert s == "Hello,\tworld!"
s = bytes("Hello,\\bworld!", "utf-8").decode("unicode-escape")
assert s == "Hello,\bworld!"

# issue 1047
# StringIO must accept a non-ASCII code point (U+20AC, the euro sign)
# and round-trip it through getvalue(), encode() and decode().
from io import StringIO

s = StringIO()
s.write(chr(8364))
assert s.getvalue() == "€"
s = chr(8364)
assert s == "€"
b = s.encode("utf-8")
# UTF-8 encoding of U+20AC is the three-byte sequence E2 82 AC.
assert b == bytes([0xe2, 0x82, 0xac])
s1 = b.decode("utf-8")
assert s1 == "€"

# issue 1049
# Subclassing str must register with issubclass().
class Mystring(str):
    pass

assert issubclass(Mystring, str)

# issue 1060
# str(bytes, encoding=...) must decode rather than repr() the bytes.
assert str(bytes('abc', encoding='ascii'), encoding='ascii') == "abc"
b = bytes('pythôn', encoding='utf-8')
def render_gz(**payload):
    """
    Render ``payload["data"]`` as gzipped JSON or CSV and cache it on S3.

    When ``payload["s3"]["update"]`` is truthy the data is serialized in
    the format given by the ``format`` request argument ("json" or
    "csv"), gzip-compressed, uploaded to S3 under ``s3["key"]`` and the
    compressed body returned.  Otherwise the previously cached object is
    fetched from S3 and its raw (already gzipped) body returned.

    Raises:
        ValueError: if the requested format is neither "json" nor "csv".
    """
    s3 = payload.get("s3")
    if s3 and s3["update"]:
        fmt = request.args.get("format")
        if fmt not in {"json", "csv"}:
            raise ValueError(f"Format {fmt} not supported!")
        df_keys = {"data", "columns", "index"}
        # Trim nested objects down to bare "id" references to keep the
        # payload small.  Iterate over a snapshot of the keys because the
        # csv branch deletes entries while looping.
        for obj in payload["data"]:
            for k in list(obj.keys()):
                if isinstance(obj[k], list):
                    if all("id" in d for d in obj[k]):
                        if fmt == "json":
                            obj[k] = [{"id": d["id"]} for d in obj[k]]
                        else:
                            obj[k] = {str(i): d["id"] for i, d in enumerate(obj[k])}
                elif fmt == "csv" and k not in df_keys:
                    # NOTE(review): in csv mode this branch also removes
                    # dicts that carry an "id" (the branch below never
                    # runs for them) -- confirm that is intended.
                    del obj[k]
                elif isinstance(obj[k], dict) and "id" in obj[k]:
                    obj[k] = {"id": obj[k]["id"]}
        if fmt == "json":
            content_type = "application/json"
            contents = ujson.dumps(payload["data"], allow_nan=True, default=encode_default)
        else:
            from pandas import DataFrame, json_normalize

            content_type = "text/csv"
            contents = None
            # One CSV section per record: normalized metadata, a blank
            # line, then the tabular data; sections separated by blank
            # lines.
            for obj in payload["data"]:
                if df_keys.issubset(obj.keys()):
                    try:
                        df = DataFrame.from_records(
                            obj["data"], columns=obj["columns"], index=obj["index"]
                        )
                    except Exception as ex:
                        # Best effort: skip records that don't form a
                        # valid DataFrame instead of failing the render.
                        print(str(ex))
                        continue
                    if contents is None:
                        contents = StringIO()
                    else:
                        contents.write("\n\n")
                    meta = json_normalize(
                        {k: v for k, v in obj.items() if k not in df_keys}
                    )
                    meta.to_csv(contents)
                    contents.write("\n")
                    df.to_csv(contents)
            if contents is None:
                # No record had data/columns/index -- flatten everything.
                contents = json_normalize(payload["data"]).to_csv()
        gzip_buffer = BytesIO()
        if isinstance(contents, StringIO):
            contents = contents.getvalue()
        with GzipFile(mode="wb", fileobj=gzip_buffer) as gzip_file:
            gzip_file.write(
                contents.encode("utf-8")
            )  # need to give full contents to compression
        body = gzip_buffer.getvalue()
        s3_client.put_object(
            Bucket=BUCKET,
            Key=s3["key"],
            ContentType=content_type,
            ContentEncoding="gzip",
            Body=body,
        )
        return body
    # Cache hit: return the stored gzipped bytes directly.  (Fix: the
    # original copied the body into an intermediate BytesIO only to call
    # .getvalue() on it immediately.)
    retr = s3_client.get_object(Bucket=BUCKET, Key=s3["key"])
    return retr["Body"].read()