Esempio n. 1
0
def FileFlush(file_handle: io.BufferedRandom) -> None:
    """Flush *file_handle* by invoking its ``flush()`` method.

    Args:
        file_handle: The buffered stream to flush.
    """
    do_flush = file_handle.flush
    do_flush()
Esempio n. 2
0
class BinaryCopy():
    """Serialise rows into an in-memory binary ``COPY`` payload and send it.

    Rows are packed (network byte order) into ``self.fs``, a
    ``BufferedRandom`` wrapped around a ``BytesIO``.  ``copy`` /
    ``copy_comments`` then hand that stream to ``conn.cursor.copy_expert``
    and reset the buffer so the instance can be reused for the next batch.

    Each ``row`` passed to the row writers is indexed as ``row[0]`` (a str
    that is encoded and written as the last field, preceded by a version
    byte) and ``row[1]`` (a mapping providing ``"id"``, ``"created_utc"``
    and the optional keys read below).
    """

    def __init__(self, chunk_size=256, buffer_size=64 * 1024 * 1024):
        """Create the reusable buffer and the pre-compiled struct packers.

        Args:
            chunk_size: Stored on the instance; not used by any method of
                this class.
            buffer_size: Buffer size passed to ``BufferedRandom``.
        """
        self.chunk_size = chunk_size
        self.buffer_size = buffer_size
        self.fs = BufferedRandom(BytesIO(), buffer_size=buffer_size)

        # int16 field count, then two (int32 length, int64 value) pairs.
        self._row_header_struct = Struct("!hiqiq")
        # int32 length prefix for variable-length fields.
        self._row_int_struct = Struct("!i")
        # int32 length followed by an int64 value.
        self._row_bigint_struct = Struct("!iq")
        # A length of -1 marks a NULL field.
        self._row_null_val = pack("!i", -1)

    def write_binary_header(self):
        """Write the 11-byte ``PGCOPY`` signature followed by two zero int32s."""
        self.fs.write(pack('!11sii', b'PGCOPY\n\xff\r\n\0', 0, 0))

    def write_binary_string(self, obj, keyname):
        """Write ``obj[keyname]`` as a length-prefixed encoded string field.

        A missing key and a ``None`` value are both written as NULL.

        Args:
            obj: Mapping to read the value from.
            keyname: Key to look up in ``obj``.
        """
        # Keep the try body minimal: only the lookup can legitimately raise
        # KeyError, so nothing raised by the writes is swallowed here.
        try:
            val = obj[keyname]
        except KeyError:
            val = None

        if val is None:
            self.fs.write(self._row_null_val)
            return

        encoded = val.encode()
        self.fs.write(self._row_int_struct.pack(len(encoded)))
        self.fs.write(encoded)

    def write_binary_bigint(self, val):
        """Write ``val`` as an 8-byte integer field, or NULL when it is None."""
        if val is None:
            self.fs.write(self._row_null_val)
        else:
            self.fs.write(self._row_bigint_struct.pack(8, val))

    def _write_row_header(self, field_count, obj):
        """Write the field count plus the two leading 8-byte fields of a row.

        The first field is ``obj["id"]`` parsed as a base-36 integer; the
        second is ``timestamp_to_pgtimestamp(int(obj["created_utc"]))``.
        """
        self.fs.write(
            self._row_header_struct.pack(
                field_count, 8, int(obj["id"], 36), 8,
                timestamp_to_pgtimestamp(int(obj["created_utc"]))))

    def _write_row_tail(self, row):
        """Write the author and subreddit strings and the trailing data field."""
        obj = row[1]
        self.write_binary_string(obj, "author")
        self.write_binary_string(obj, "subreddit")

        # jsonb data: the declared length is len(data) + 1 because a single
        # version byte (1) is written in front of the encoded text.
        data = row[0].encode()
        self.fs.write(pack("!ib", len(data) + 1, 1))
        self.fs.write(data)

    def write_comment_row(self, row):
        """Serialise one comment row (8 fields) into the buffer.

        Args:
            row: Sequence whose ``row[0]`` is the raw text written as the
                last field and whose ``row[1]`` is the parsed mapping.
        """
        obj = row[1]
        self._write_row_header(8, obj)

        # These are written one by one because each may be NULL.
        self.write_binary_bigint(get_article(obj.get("link_id", None)))
        self.write_binary_bigint(get_sub_id(obj.get("subreddit_id", None)))
        self.write_binary_bigint(get_parent(obj.get("parent_id", None)))

        self._write_row_tail(row)

    def write_submission_row(self, row):
        """Serialise one submission row (6 fields) into the buffer.

        Args:
            row: Sequence whose ``row[0]`` is the raw text written as the
                last field and whose ``row[1]`` is the parsed mapping.
        """
        obj = row[1]
        self._write_row_header(6, obj)

        # Written separately because it may be NULL.
        self.write_binary_bigint(get_sub_id(obj.get("subreddit_id", None)))

        self._write_row_tail(row)

    def copy_comments(self, conn, table, lines):
        """Copy comment rows into ``table``.

        Equivalent to ``copy(conn, table, lines, "comments")``.
        """
        self.copy(conn, table, lines, "comments")

    def copy(self, conn, table, lines, thing_type):
        """Serialise ``lines`` and send them via ``conn.cursor.copy_expert``.

        Args:
            conn: Object whose ``cursor`` attribute exposes ``copy_expert``.
            table: Table name interpolated verbatim into the COPY statement;
                it must come from a trusted source.
            lines: Iterable of rows accepted by the matching row writer.
            thing_type: ``"comments"`` or ``"submissions"``.

        Raises:
            ValueError: If ``thing_type`` is not one of the supported values.
        """
        # Validate before touching the buffer so a bad argument cannot leave
        # a stray header behind for the next call.
        if thing_type == "comments":
            write_row = self.write_comment_row
        elif thing_type == "submissions":
            write_row = self.write_submission_row
        else:
            raise ValueError("Unknown thing type {}".format(thing_type))

        try:
            self.write_binary_header()

            for line in lines:
                write_row(line)

            # End-of-data trailer: an int16 field count of -1.
            self.fs.write(pack('!h', -1))
            self.fs.flush()
            self.fs.seek(0)

            # NOTE(review): conn.cursor is accessed as an attribute rather
            # than called -- presumably conn is a wrapper exposing a live
            # cursor; confirm against the caller.
            conn.cursor.copy_expert(
                "copy %s from stdin with binary " % (table), self.fs)
        finally:
            # Always reset the reusable buffer, even when serialisation or
            # the COPY itself fails, so the next batch starts empty.
            self.fs.seek(0)
            self.fs.truncate()