Example #1
0
    def get_tokens(self, text, unfiltered=False):
        """
        Return an iterable of (tokentype, value) pairs generated from
        `text`. If `unfiltered` is set to `True`, the filtering mechanism
        is bypassed even if filters are defined.

        Also preprocess the text, i.e. expand tabs and strip it if
        wanted and applies registered filters.
        """
        # Only preprocess plain strings; anything else (e.g. an already
        # wrapped file-like object) is passed through untouched.
        if isinstance(text, str):
            if self.stripall:
                # Strip all leading/trailing whitespace.
                text = text.strip()
            elif self.stripnl:
                # Strip only leading/trailing newlines.
                text = text.strip('\n')

            # Python 2 compatibility: there `str` is a byte string, which is
            # re-encoded before wrapping, and `self.encoding` is updated as a
            # side effect. NOTE(review): on Python 2, `str.encode('utf-8')`
            # implicitly decodes via the default codec first — verify this is
            # the intended handling of non-ASCII byte input.
            if sys.version_info[0] < 3 and isinstance(text, str):
                text = StringIO(text.encode('utf-8'))
                self.encoding = 'utf-8'
            else:
                text = StringIO(text)

        def streamer():
            # Drop the position index from the 3-tuples produced by
            # get_tokens_unprocessed(), keeping only (tokentype, value).
            for i, t, v in self.get_tokens_unprocessed(text):
                yield t, v
        stream = streamer()
        if not unfiltered:
            # Pass the token stream through every registered filter.
            stream = apply_filters(stream, self.filters, self)
        return stream
Example #2
0
    def get_tokens(self, text, unfiltered=False):
        """
        Return an iterable of (tokentype, value) pairs generated from
        `text`. If `unfiltered` is set to `True`, the filtering mechanism
        is bypassed even if filters are defined.

        Also preprocess the text, i.e. expand tabs and strip it if
        wanted and applies registered filters.
        """
        if isinstance(text, str):
            # Apply the configured stripping mode before tokenizing.
            if self.stripall:
                text = text.strip()
            elif self.stripnl:
                text = text.strip('\n')

            # Python 2 byte strings are re-encoded (recording the encoding
            # used); everything else is wrapped directly.
            if sys.version_info[0] < 3 and isinstance(text, str):
                self.encoding = 'utf-8'
                text = StringIO(text.encode('utf-8'))
            else:
                text = StringIO(text)

        # Keep only (tokentype, value) from the 3-tuples produced by
        # get_tokens_unprocessed(); the position index is discarded.
        stream = ((token, value)
                  for _, token, value in self.get_tokens_unprocessed(text))
        if not unfiltered:
            # Run the token stream through every registered filter.
            stream = apply_filters(stream, self.filters, self)
        return stream
Example #3
0
	def run(self):
		"""Run the corpus through the translation pipeline and classify
		generation errors.

		Populates ``self.multiform``, ``self.multibidix`` and
		``self.tagmismatch`` with the offending output lines, and
		``self.timer`` with the elapsed wall-clock time in seconds.
		"""
		timing_begin = time.time()
		# Translate the corpus with apertium.  The corpus file is opened in
		# a `with` block so the descriptor is closed once the subprocess has
		# consumed it (the original leaked the handle).
		with open(self.corpus) as corpus_file:
			app = Popen(['apertium', '-d', self.directory, self.mode], stdin=corpus_file, stdout=PIPE, close_fds=True)
			raw = app.communicate()[0].decode('utf-8')
		transfer = self.get_transfer(raw)
		del raw  # the raw pipeline output can be large; free it early
		
		# One "<count> <unit>" line per distinct transfer unit, most
		# frequent first.
		stripped = StringIO()
		for word, count in Counter(transfer).most_common():
			stripped.write("{:>6} {:<}\n".format(count, word))
		stripped = stripped.getvalue()
		
		# Generate surface forms for the counted units.
		app = Popen(['lt-proc', '-d', "%s.autogen.bin" % pjoin(self.directory, self.lang)], stdin=PIPE, stdout=PIPE, close_fds=True)
		surface = app.communicate(stripped.encode('utf-8'))[0].decode('utf-8')
		# Drop the leading frequency column so both streams align line-wise.
		nofreq = re.sub(r'[\s\t]*\d*\s*\^', '^', stripped)
		
		# Two fixed-width columns: generated surface form next to its
		# frequency-stripped source line.
		gen_errors = StringIO()
		for pair in itertools.zip_longest(surface.split('\n'), nofreq.split('\n'), fillvalue=""):
			gen_errors.write("{:<16}{:<16}\n".format(*list(str(x) for x in pair)))
		gen_errors = gen_errors.getvalue().split('\n')

		multiform = []
		multibidix = []
		tagmismatch = []
		
		# Classify each line by the generator's markers: '#' flags a
		# generation failure, '/' flags multiple forms.
		for line in gen_errors:
			if "#" in line:
				if re.search(r'[0-9] #.*\/', line):
					# Failure AND multiple forms: multiple bidix entries.
					multibidix.append(line)
				elif re.search(r'[0-9] #', line) and '/' not in line:
					# Failure with a single form: tag mismatch.
					tagmismatch.append(line)
			elif "/" in line:
				multiform.append(line)

		self.multiform = multiform
		self.multibidix = multibidix
		self.tagmismatch = tagmismatch
		self.timer = time.time() - timing_begin
Example #4
0
# NOTE(review): this chunk starts mid-script — `s` here is defined above the
# visible region, and `b` on the last line is presumably used below it.
assert s == "Hello,\nworld!"

# "unicode-escape" decoding interprets backslash escape sequences that are
# literal in the byte string (\\t, \\b) as their control characters.
s = bytes("Hello,\\tworld!", "utf-8").decode("unicode-escape")
assert s == "Hello,\tworld!"

s = bytes("Hello,\\bworld!", "utf-8").decode("unicode-escape")
assert s == "Hello,\bworld!"

# issue 1047: StringIO must accept and round-trip non-ASCII characters
# (chr(8364) is the euro sign).
from io import StringIO
s = StringIO()
s.write(chr(8364))
assert s.getvalue() == "€"
s = chr(8364)
assert s == "€"
# UTF-8 encoding of U+20AC is the 3-byte sequence e2 82 ac.
b = s.encode("utf-8")
assert b == bytes([0xe2, 0x82, 0xac])
s1 = b.decode("utf-8")
assert s1 == "€"


# issue 1049: subclassing str must register with issubclass.
class Mystring(str):
    pass


assert issubclass(Mystring, str)

# issue 1060: the two-argument str(bytes, encoding) constructor decodes.
assert str(bytes('abc', encoding='ascii'), encoding='ascii') == "abc"
b = bytes('pythôn', encoding='utf-8')
Example #5
0
def render_gz(**payload):
    """Render ``payload["data"]`` as gzipped JSON or CSV and sync it to S3.

    When ``payload["s3"]["update"]`` is truthy, the data is serialized in the
    format named by the ``format`` request argument ("json" or "csv"),
    gzip-compressed, uploaded to S3 under ``s3["key"]``, and the compressed
    body returned.  Otherwise the previously stored object is fetched from
    S3 and its raw (already gzipped) bytes returned.

    NOTE(review): ``payload["data"]`` is mutated in place (nested objects are
    reduced to bare ``{"id": ...}`` references) — confirm callers do not
    reuse the payload afterwards.

    Raises:
        ValueError: if the requested format is neither "json" nor "csv".
    """
    s3 = payload.get("s3")

    if s3 and s3["update"]:
        fmt = request.args.get("format")
        if fmt not in {"json", "csv"}:
            raise ValueError(f"Format {fmt} not supported!")

        # Keys that make up the tabular part of a record; everything else
        # is treated as metadata.
        df_keys = {"data", "columns", "index"}

        # Shrink nested objects to bare id references so the serialized
        # output stays small.  list(obj.keys()) snapshot is required because
        # keys may be deleted while iterating.
        for obj in payload["data"]:
            for k in list(obj.keys()):
                if isinstance(obj[k], list):
                    if all("id" in d for d in obj[k]):
                        if fmt == "json":
                            obj[k] = [{"id": d["id"]} for d in obj[k]]
                        else:
                            obj[k] = {str(i): d["id"] for i, d in enumerate(obj[k])}
                    elif fmt == "csv" and k not in df_keys:
                        # Lists without ids cannot be represented in CSV.
                        del obj[k]
                elif isinstance(obj[k], dict) and "id" in obj[k]:
                    obj[k] = {"id": obj[k]["id"]}

        if fmt == "json":
            content_type = "application/json"
            contents = ujson.dumps(payload["data"], allow_nan=True, default=encode_default)
        else:
            from pandas import DataFrame, json_normalize

            content_type = "text/csv"
            contents = None

            # Emit one CSV section per record: flattened metadata, a blank
            # line, then the table; records separated by double newlines.
            for obj in payload["data"]:
                if df_keys.issubset(obj.keys()):
                    try:
                        df = DataFrame.from_records(
                            obj["data"], columns=obj["columns"], index=obj["index"]
                        )
                    except Exception as ex:
                        # Best-effort: skip records pandas cannot tabulate.
                        # NOTE(review): consider logging instead of print.
                        print(str(ex))
                        continue

                    if contents is None:
                        contents = StringIO()
                    else:
                        contents.write("\n\n")

                    meta = json_normalize(
                        {k: v for k, v in obj.items() if k not in df_keys}
                    )
                    meta.to_csv(contents)
                    contents.write("\n")
                    df.to_csv(contents)

            # No record carried a full table — flatten the whole payload.
            if contents is None:
                contents = json_normalize(payload["data"]).to_csv()

        if isinstance(contents, StringIO):
            contents = contents.getvalue()

        # Compress the full serialized body in one shot.
        gzip_buffer = BytesIO()
        with GzipFile(mode="wb", fileobj=gzip_buffer) as gzip_file:
            gzip_file.write(contents.encode("utf-8"))

        body = gzip_buffer.getvalue()
        s3_client.put_object(
            Bucket=BUCKET,
            Key=s3["key"],
            ContentType=content_type,
            ContentEncoding="gzip",
            Body=body,
        )
        return body

    # NOTE(review): if ``s3`` is falsy this raises TypeError on s3["key"] —
    # confirm callers always supply an s3 config on the read path.
    retr = s3_client.get_object(Bucket=BUCKET, Key=s3["key"])
    # Return the stored bytes directly; the original copied them through an
    # unnecessary intermediate BytesIO buffer.
    return retr["Body"].read()