Code Example #1
File: postgres.py Project: alialliallie/shiftmanager
    def get_csv_chunk_generator(self, csv_file_path, row_count, chunks):
        """
        Given a CSV file path and its row count, yield the file's contents
        as *chunks* string chunks.

        Parameters
        ----------
        csv_file_path : str
            File path for the CSV written by Postgres
        row_count : int
            Number of rows in the CSV
        chunks : int
            Number of chunks to yield

        Yields
        ------
        str
        """
        # Yield only a single chunk if the number of rows is small.
        if row_count <= chunks:
            with open(csv_file_path, "r") as f:
                yield f.read()
            # PEP 479: end the generator with a bare return, not StopIteration
            return

        # Compute chunk boundaries: linspace gives the left-closed start
        # index of each chunk; shifting it by one (and capping at the last
        # row index) gives the right-closed end index of each chunk.
        left_closed_boundary = util.linspace(0, row_count, chunks)
        left_closed_boundary.append(row_count - 1)
        right_closed_boundary = left_closed_boundary[1:]
        final_boundary_index = len(right_closed_boundary) - 1

        # We're going to allocate a large buffer for this -- let's read as fast
        # as possible
        chunk_lines = []
        boundary_index = 0
        boundary = right_closed_boundary[boundary_index]
        one_mebibyte = 1048576
        with open(csv_file_path, "r", buffering=one_mebibyte) as f:
            for line_number, row in enumerate(f):
                chunk_lines.append(row)
                if line_number == boundary:
                    if boundary_index != final_boundary_index:
                        boundary_index += 1
                        boundary = right_closed_boundary[boundary_index]
                    yield "".join(chunk_lines)
                    chunk_lines = []
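The generator leans on util.linspace from the project's util module to compute chunk start offsets. Below is a minimal sketch of the behavior the call site appears to assume (evenly spaced integer offsets in [start, stop), akin to numpy.linspace without the endpoint), followed by a hypothetical call to the generator; the instance name and file path are illustrative, not taken from the project.

def linspace(start, stop, num):
    # Evenly spaced integer start offsets in [start, stop);
    # e.g. linspace(0, 10, 4) -> [0, 2, 5, 7].
    step = (stop - start) / float(num)
    return [start + int(step * i) for i in range(num)]

# Hypothetical usage: stream a 1,000,000-row CSV in 8 chunks.
# pg = Postgres(...)  # assumed host class of get_csv_chunk_generator
# for chunk in pg.get_csv_chunk_generator("/tmp/export.csv", 1000000, 8):
#     upload_chunk(chunk)  # hypothetical per-chunk consumer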
Code Example #2
File: s3.py Project: wallawaz/shiftmanager
    def chunked_json_slices(data, slices, directory=None, clean_on_exit=True):
        """
        Given a sequence of dicts, chunk it into *slices* pieces and write
        each piece to a temp file on disk. Clean up when leaving scope.

        Parameters
        ----------
        data : sequence of dicts
            Sequence of dictionaries to be serialized to chunks; must
            support len() and slicing
        slices : int
            Number of chunks to generate
        directory : str, optional
            Directory to write chunks to. Defaults to
            $HOME/.shiftmanager/tmp/
        clean_on_exit : bool, default True
            Clean up chunks on disk when context exits

        Yields
        ------
        stamp : str
            Timestamp that prefixes the filenames of chunks written to disk
        chunk_files : list
            List of filenames
        """

        # Bind chunk_files before the try block so the finally clause
        # cannot hit a NameError if an early statement raises; this also
        # ensures that files get cleaned up even on a raised exception.
        chunk_files = []
        try:
            num_data = len(data)
            chunk_range_start = util.linspace(0, num_data, slices)
            chunk_range_end = chunk_range_start[1:]
            chunk_range_end.append(None)
            stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S%f")

            if not directory:
                user_home = os.path.expanduser("~")
                directory = os.path.join(user_home, ".shiftmanager", "tmp")

            if not os.path.exists(directory):
                os.makedirs(directory)

            range_zipper = list(zip(chunk_range_start, chunk_range_end))
            for i, (inclusive, exclusive) in enumerate(range_zipper):

                # Get either an inclusive/exclusive slice,
                # or the slice to the end of the range
                if exclusive is not None:
                    sliced = data[inclusive:exclusive]
                else:
                    sliced = data[inclusive:]

                newlined = ""
                for doc in sliced:
                    newlined = "{}{}\n".format(newlined, json.dumps(doc))

                filepath = "{}.gz".format("-".join([stamp, str(i)]))
                write_path = os.path.join(directory, filepath)
                with gzip.open(write_path, "wb") as current_fp:
                    current_fp.write(newlined.encode("utf-8"))
                chunk_files.append(write_path)

            yield stamp, chunk_files

        finally:
            if clean_on_exit:
                for filepath in chunk_files:
                    os.remove(filepath)
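The yield inside try/finally suggests that in the full source this method is wrapped with contextlib.contextmanager, making a with block the natural call site. A hypothetical usage sketch with made-up records; the upload step is only indicated in a comment.

records = [{"id": i, "name": "row-{}".format(i)} for i in range(1000)]

# Assumes chunked_json_slices is exposed as a context manager.
with chunked_json_slices(records, slices=4) as (stamp, chunk_files):
    for path in chunk_files:
        print(stamp, path)  # e.g. upload each gzipped chunk to S3 here
# On exit, the finally clause removes the chunk files (clean_on_exit=True).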