Example #1
class TombOutputThread(QtCore.QThread):
    line_received = QtCore.pyqtSignal(QtCore.QString)
    error_received = QtCore.pyqtSignal(QtCore.QString)
    progressed = QtCore.pyqtSignal(int) #value in percent

    def __init__(self):
        QtCore.QThread.__init__(self)
        self.buffer = NamedTemporaryFile()

    def run(self):
        while True:
            where = self.buffer.tell()
            line = self.buffer.readline()
            if not line:
                time.sleep(1)
                self.buffer.seek(where)
            else:
                # ANSI color escapes mess this up, but it's ok anyway
                self.line_received.emit(line)
                self.parse_line(line)

    def parse_line(self, line):
        #This could be simplified, and s/search/match, if --no-color supported
        #see #59
        #TODO: this should be moved to tomblib.parse
        parsed = parse_line(line)
        if parsed and parsed['type'] == 'error':
            self.error_received.emit(parsed['content'])
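
The run() loop above is a generic tail-follow pattern: remember the offset with tell(), try readline(), and if nothing new arrived, sleep and seek() back. Below is a minimal, Qt-free sketch of just that pattern; follow, handle_line, polls and interval are illustrative names, not part of the example.

# Standalone sketch of the tell()/readline()/seek() tail pattern used above.
import time
from tempfile import NamedTemporaryFile

def follow(buf, handle_line, polls=5, interval=0.1):
    """Poll `buf` for complete lines, remembering the position between reads."""
    for _ in range(polls):
        where = buf.tell()            # remember where we were
        line = buf.readline()
        if not line:
            time.sleep(interval)      # nothing new yet; wait and rewind
            buf.seek(where)
        else:
            handle_line(line)

if __name__ == "__main__":
    buf = NamedTemporaryFile()
    buf.write(b"first line\nsecond line\n")
    buf.flush()
    buf.seek(0)
    follow(buf, lambda line: print(line.rstrip().decode()))
    buf.close()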
Example #2
    def _write_local_data_files(self, cursor):
        """
        Takes a cursor, and writes results to a local file.

        :return: A dictionary where keys are filenames to be used as object
            names in GCS, and values are file handles to local files that
            contain the data for the GCS objects.
        """
        schema = list(
            map(lambda schema_tuple: schema_tuple[0].replace(' ', '_'),
                cursor.description))
        file_no = 0
        tmp_file_handle = NamedTemporaryFile(delete=True)
        tmp_file_handles = {self.filename.format(file_no): tmp_file_handle}

        for row in cursor:
            # Convert if needed
            row = map(self.convert_types, row)
            row_dict = dict(zip(schema, row))

            s = json.dumps(row_dict, sort_keys=True)
            s = s.encode('utf-8')
            tmp_file_handle.write(s)

            # Append newline to make dumps BQ compatible
            tmp_file_handle.write(b'\n')

            # Stop if the file exceeds the file size limit
            if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
                file_no += 1
                tmp_file_handle = NamedTemporaryFile(delete=True)
                tmp_file_handles[self.filename.format(
                    file_no)] = tmp_file_handle

        return tmp_file_handles
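
The size-based rotation in the loop above (check tell() after each write, start a fresh NamedTemporaryFile once a threshold is passed) is reused by most of the examples on this page. A self-contained sketch of that pattern; filename_template and max_bytes are illustrative arguments, not from the example.

# Write newline-delimited JSON, rotating to a new temp file past a size limit.
import json
from tempfile import NamedTemporaryFile

def write_rotated(rows, filename_template="export_{}.json", max_bytes=1024):
    file_no = 0
    handle = NamedTemporaryFile(delete=True)
    handles = {filename_template.format(file_no): handle}
    for row in rows:
        handle.write(json.dumps(row, sort_keys=True).encode("utf-8"))
        handle.write(b"\n")                    # newline-delimited JSON
        if handle.tell() >= max_bytes:         # rotate to a fresh temp file
            file_no += 1
            handle = NamedTemporaryFile(delete=True)
            handles[filename_template.format(file_no)] = handle
    return handles

# handles = write_rotated([{"id": i} for i in range(200)])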
Example #3
    def _write_local_data_files(self, cursor):
        """
        Takes a cursor, and writes results to a local file.

        :return: A list of dictionaries, one per local file, each holding the
            file name to be used as the object name in GCS, the file handle,
            and the file MIME type.
        """
        schema = list(
            map(lambda schema_tuple: schema_tuple[0], cursor.description))
        col_type_dict = self._get_col_type_dict()
        file_no = 0
        tmp_file_handle = NamedTemporaryFile(delete=True)
        if self.export_format == 'csv':
            file_mime_type = 'text/csv'
        else:
            file_mime_type = 'application/json'
        files_to_upload = [{
            'file_name': self.filename.format(file_no),
            'file_handle': tmp_file_handle,
            'file_mime_type': file_mime_type
        }]

        if self.export_format == 'csv':
            csv_writer = self._configure_csv_file(tmp_file_handle, schema)

        for row in cursor:
            # Convert datetime objects to utc seconds, and decimals to floats.
            # Convert binary type object to string encoded with base64.
            row = self._convert_types(schema, col_type_dict, row)

            if self.export_format == 'csv':
                csv_writer.writerow(row)
            else:
                row_dict = dict(zip(schema, row))

                # TODO validate that row isn't > 2MB. BQ enforces a hard row size of 2MB.
                s = json.dumps(row_dict, sort_keys=True).encode('utf-8')
                tmp_file_handle.write(s)

                # Append newline to make dumps BigQuery compatible.
                tmp_file_handle.write(b'\n')

            # Stop if the file exceeds the file size limit.
            if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
                file_no += 1
                tmp_file_handle = NamedTemporaryFile(delete=True)
                files_to_upload.append({
                    'file_name': self.filename.format(file_no),
                    'file_handle': tmp_file_handle,
                    'file_mime_type': file_mime_type
                })

                if self.export_format == 'csv':
                    csv_writer = self._configure_csv_file(
                        tmp_file_handle, schema)

        return files_to_upload
Example #4
def xls_export(request, username, id_string):
    owner = get_object_or_404(User, username=username)
    xform = get_object_or_404(XForm, id_string=id_string, user=owner)
    if not has_permission(xform, owner, request):
        return HttpResponseForbidden(_(u"Not shared."))
    query = request.GET.get("query")
    force_xlsx = request.GET.get("xlsx") == "true"
    xls_df_builder = XLSDataFrameBuilder(username, id_string, query)
    excel_defs = {
        "xls": {"suffix": ".xls", "mime_type": "vnd.ms-excel"},
        "xlsx": {"suffix": ".xlsx", "mime_type": "vnd.openxmlformats"},  # TODO: check xlsx mime type
    }
    ext = "xls" if not force_xlsx else "xlsx"
    if xls_df_builder.exceeds_xls_limits:
        ext = "xlsx"
    try:
        temp_file = NamedTemporaryFile(suffix=excel_defs[ext]["suffix"])
        xls_df_builder.export_to(temp_file.name)

        if request.GET.get("raw"):
            id_string = None
        response = response_with_mimetype_and_name(excel_defs[ext]["mime_type"], id_string, extension=ext)
        response.write(temp_file.read())
        temp_file.seek(0, os.SEEK_END)
        response["Content-Length"] = temp_file.tell()
        temp_file.close()
        return response
    except NoRecordsFoundError:
        return HttpResponse(_("No records found to export"))
Example #5
def make_image_file(dimensions=(320, 240), extension=".jpeg", force_size=None):
    """
    Yields a named temporary file created with the specified image type and
    options.

    Note the default dimensions are unequal (not a square) ensuring that center-square
    cropping logic will be exercised during tests.

    The temporary file will be closed and deleted automatically upon exiting
    the `with` block.
    """
    image = Image.new('RGB', dimensions, "green")
    image_file = NamedTemporaryFile(suffix=extension)
    try:
        image.save(image_file)
        if force_size is not None:
            image_file.seek(0, os.SEEK_END)
            bytes_to_pad = force_size - image_file.tell()
            # write in hunks of 256 bytes
            hunk, byte_ = bytearray([0] * 256), bytearray([0])
            num_hunks, remainder = divmod(bytes_to_pad, 256)
            for _ in xrange(num_hunks):
                image_file.write(hunk)
            for _ in xrange(remainder):
                image_file.write(byte_)
            image_file.flush()
        image_file.seek(0)
        yield image_file
    finally:
        image_file.close()
Example #6
    def _write_local_data_files(self, cursor):
        """
        Takes a cursor, and writes results to a local file.
        :return: A dictionary where keys are filenames to be used as object
            names in GCS, and values are file handles to local files that
            contain the data for the GCS objects.
        """
        file_no = 0
        tmp_file_handle = NamedTemporaryFile(delete=True)
        tmp_file_handles = {self.filename.format(file_no): tmp_file_handle}

        for row in cursor:
            self.log.info(row)
            if PY3:
                row = row.encode('utf-8')
            tmp_file_handle.write(row)

            # Append newline to make dumps BigQuery compatible.
            tmp_file_handle.write(b'\n')

            # Stop if the file exceeds the file size limit.
            if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
                file_no += 1
                tmp_file_handle = NamedTemporaryFile(delete=True)
                tmp_file_handles[self.filename.format(
                    file_no)] = tmp_file_handle

        return tmp_file_handles
Example #7
    def _write_local_data_files(self, cursor):
        """
        Takes a cursor, and writes results to a local file.

        :return: A dictionary where keys are filenames to be used as object
            names in GCS, and values are file handles to local files that
            contain the data for the GCS objects.
        """
        schema = list(map(lambda schema_tuple: schema_tuple[0], cursor.description))
        file_no = 0
        tmp_file_handle = NamedTemporaryFile(delete=True)
        tmp_file_handles = {self.filename.format(file_no): tmp_file_handle}

        for row in cursor:
            # Convert datetime objects to utc seconds, and decimals to floats
            row = map(self.convert_types, row)
            row_dict = dict(zip(schema, row))

            # TODO validate that row isn't > 2MB. BQ enforces a hard row size of 2MB.
            s = json.dumps(row_dict)
            if PY3:
                s = s.encode('utf-8')
            tmp_file_handle.write(s)

            # Append newline to make dumps BigQuery compatible.
            tmp_file_handle.write(b'\n')

            # Stop if the file exceeds the file size limit.
            if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
                file_no += 1
                tmp_file_handle = NamedTemporaryFile(delete=True)
                tmp_file_handles[self.filename.format(file_no)] = tmp_file_handle

        return tmp_file_handles
Example #8
    def _write_local_data_files(self, cursor):
        """
        Takes a cursor, and writes results to a local file.

        :return: A dictionary where keys are filenames to be used as object
            names in GCS, and values are file handles to local files that
            contain the data for the GCS objects.
        """
        schema = list(map(lambda schema_tuple: schema_tuple[0].replace(' ', '_'), cursor.description))
        file_no = 0
        tmp_file_handle = NamedTemporaryFile(delete=True)
        tmp_file_handles = {self.filename.format(file_no): tmp_file_handle}

        for row in cursor:
            # Convert if needed
            row = map(self.convert_types, row)
            row_dict = dict(zip(schema, row))

            s = json.dumps(row_dict, sort_keys=True)
            s = s.encode('utf-8')
            tmp_file_handle.write(s)

            # Append newline to make dumps BQ compatible
            tmp_file_handle.write(b'\n')

            # Stop if the file exceeds the file size limit
            if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
                file_no += 1
                tmp_file_handle = NamedTemporaryFile(delete=True)
                tmp_file_handles[self.filename.format(file_no)] = tmp_file_handle

        return tmp_file_handles
Example #9
    def _write_local_data_files(self, cursor):
        """
        Takes a cursor, and writes results to a local file.

        :return: A dictionary where keys are filenames to be used as object
            names in GCS, and values are file handles to local files that
            contain the data for the GCS objects.
        """
        schema = map(lambda schema_tuple: schema_tuple[0], cursor.description)
        file_no = 0
        tmp_file_handle = NamedTemporaryFile(delete=True)
        tmp_file_handles = {self.filename.format(file_no): tmp_file_handle}

        for row in cursor:
            # Convert datetime objects to utc seconds, and decimals to floats
            row = map(self.convert_types, row)
            row_dict = dict(zip(schema, row))

            # TODO validate that row isn't > 2MB. BQ enforces a hard row size of 2MB.
            json.dump(row_dict, tmp_file_handle)

            # Append newline to make dumps BigQuery compatible.
            tmp_file_handle.write('\n')

            # Stop if the file exceeds the file size limit.
            if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
                file_no += 1
                tmp_file_handle = NamedTemporaryFile(delete=True)
                tmp_file_handles[self.filename.format(file_no)] = tmp_file_handle

        return tmp_file_handles
Example #10
    def _write_local_data_files(self, cursor):
        """
        Takes a cursor, and writes results to a local file.

        :return: A dictionary where keys are filenames to be used as object
            names in GCS, and values are file handles to local files that
            contain the data for the GCS objects.
        """
        file_no = 0
        tmp_file_handle = NamedTemporaryFile(delete=True)
        tmp_file_handles = {self.filename.format(file_no): tmp_file_handle}
        for row in cursor:
            row_dict = self.generate_data_dict(row._fields, row)
            content = json.dumps(row_dict).encode('utf-8')
            tmp_file_handle.write(content)

            # Append newline to make dumps BigQuery compatible.
            tmp_file_handle.write(b'\n')

            if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
                file_no += 1
                tmp_file_handle = NamedTemporaryFile(delete=True)
                tmp_file_handles[self.filename.format(
                    file_no)] = tmp_file_handle

        return tmp_file_handles
Example #11
def make_image_file(dimensions=(320, 240), extension=".jpeg", force_size=None, orientation=None):
    """
    Yields a named temporary file created with the specified image type and
    options.

    Note the default dimensions are unequal (not a square) ensuring that center-square
    cropping logic will be exercised during tests.

    The temporary file will be closed and deleted automatically upon exiting
    the `with` block.
    """
    image = Image.new('RGB', dimensions, "green")
    image_file = NamedTemporaryFile(suffix=extension)
    try:
        if orientation and orientation in xrange(1, 9):
            exif_bytes = piexif.dump({'0th': {piexif.ImageIFD.Orientation: orientation}})
            image.save(image_file, exif=exif_bytes)
        else:
            image.save(image_file)
        if force_size is not None:
            image_file.seek(0, os.SEEK_END)
            bytes_to_pad = force_size - image_file.tell()
            # write in hunks of 256 bytes
            hunk, byte_ = bytearray([0] * 256), bytearray([0])
            num_hunks, remainder = divmod(bytes_to_pad, 256)
            for _ in xrange(num_hunks):
                image_file.write(hunk)
            for _ in xrange(remainder):
                image_file.write(byte_)
            image_file.flush()
        image_file.seek(0)
        yield image_file
    finally:
        image_file.close()
Example #12
def make_image_file(dimensions=(320, 240), extension=".jpeg", force_size=None, orientation=None):
    """
    Yields a named temporary file created with the specified image type and
    options.

    The temporary file will be closed and deleted automatically upon exiting
    the `with` block.
    """
    image = Image.new('RGB', dimensions, "green")
    image_file = NamedTemporaryFile(suffix=extension)
    try:
        if orientation and orientation in xrange(1, 9):
            exif_bytes = piexif.dump({'0th': {piexif.ImageIFD.Orientation: orientation}})
            image.save(image_file, exif=exif_bytes)
        else:
            image.save(image_file)
        if force_size is not None:
            image_file.seek(0, os.SEEK_END)
            bytes_to_pad = force_size - image_file.tell()
            # write in hunks of 256 bytes
            hunk, byte_ = bytearray([0] * 256), bytearray([0])
            num_hunks, remainder = divmod(bytes_to_pad, 256)
            for _ in xrange(num_hunks):
                image_file.write(hunk)
            for _ in xrange(remainder):
                image_file.write(byte_)
            image_file.flush()
        image_file.seek(0)
        yield image_file
    finally:
        image_file.close()
Example #13
    def _write_local_data_files(self, cursor):
        """
        Takes a cursor, and writes results to a local file.

        :return: A dictionary where keys are filenames to be used as object
            names in GCS, and values are file handles to local files that
            contain the data for the GCS objects.
        """
        file_no = 0
        tmp_file_handle = NamedTemporaryFile(delete=True)
        tmp_file_handles = {self.filename.format(file_no): tmp_file_handle}
        for row in cursor:
            row_dict = self.generate_data_dict(row._fields, row)
            s = json.dumps(row_dict)
            if PY3:
                s = s.encode('utf-8')
            tmp_file_handle.write(s)

            # Append newline to make dumps BigQuery compatible.
            tmp_file_handle.write(b'\n')

            if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
                file_no += 1
                tmp_file_handle = NamedTemporaryFile(delete=True)
                tmp_file_handles[self.filename.format(file_no)] = tmp_file_handle

        return tmp_file_handles
Example #14
def convert_icon(filelike: IO[bytes], pixel_size: int) -> IO[bytes]:
    """Convert the input image (often ICO format) into a (crushed) 32x32 PNG.

    Crushing is important because there will be a lot of these images and we
    want to keep them small to reduce s3 costs and ensure that they are
    retained in HTTP caches as long as possible."""
    image = Image.open(filelike)

    # Create a named temporary file with auto-deletion off
    try:
        temp_file = NamedTemporaryFile(mode="r+b",
                                       delete=False,
                                       prefix="quarchive-tmp-icon-",
                                       suffix=".png")

        resized = image.resize((pixel_size, pixel_size),
                               resample=Image.LANCZOS)
        resized.save(temp_file, format="png")
        initial_size = temp_file.tell()

        # Close the handle to write out new image file to the fs
        temp_file.close()

        # pngcrush does a lot better than PIL at optimizing (50% or more)
        crushed_filename = f"{temp_file.name}.crushed"
        result = subprocess.run(
            ["pngcrush", temp_file.name, crushed_filename],
            capture_output=True,
        )

        # Raise an exception if something went wrong
        try:
            result.check_returncode()
        except subprocess.CalledProcessError:
            log.error(
                "pngcrush failed: stdout='%s', stderr='%s'",
                result.stdout,
                result.stderr,
            )
            raise

        # Open a handle to the new, crushed, png
        rv = open(crushed_filename, mode="r+b")

        # Log out the size reduction
        rv.seek(0, 2)
        crushed_size = rv.tell()
        rv.seek(0)
        log.debug("reduced image from %d bytes to %d", initial_size,
                  crushed_size)

    finally:
        # clean up our temp file
        os.remove(temp_file.name)
        # then delete the crushed file on disk (the already-open handle keeps
        # working because the unlinked inode lives until the handle is closed)
        os.remove(crushed_filename)

    return rv
Example #15
    def _write_local_data_files(self, cursor):
        """
        Takes a cursor, and writes results to a local file.

        :return: A list of dictionaries, one per local file, each holding the
            file name to be used as the object name in GCS, the file handle,
            and the file MIME type.
        """
        schema = list(map(lambda schema_tuple: schema_tuple[0], cursor.description))
        col_type_dict = self._get_col_type_dict()
        file_no = 0
        tmp_file_handle = NamedTemporaryFile(delete=True)
        if self.export_format == 'csv':
            file_mime_type = 'text/csv'
        else:
            file_mime_type = 'application/json'
        files_to_upload = [{
            'file_name': self.filename.format(file_no),
            'file_handle': tmp_file_handle,
            'file_mime_type': file_mime_type
        }]

        if self.export_format == 'csv':
            csv_writer = self._configure_csv_file(tmp_file_handle, schema)

        for row in cursor:
            # Convert datetime objects to utc seconds, and decimals to floats.
            # Convert binary type object to string encoded with base64.
            row = self._convert_types(schema, col_type_dict, row)

            if self.export_format == 'csv':
                csv_writer.writerow(row)
            else:
                row_dict = dict(zip(schema, row))

                # TODO validate that row isn't > 2MB. BQ enforces a hard row size of 2MB.
                s = json.dumps(row_dict, sort_keys=True)
                if PY3:
                    s = s.encode('utf-8')
                tmp_file_handle.write(s)

                # Append newline to make dumps BigQuery compatible.
                tmp_file_handle.write(b'\n')

            # Stop if the file exceeds the file size limit.
            if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
                file_no += 1
                tmp_file_handle = NamedTemporaryFile(delete=True)
                files_to_upload.append({
                    'file_name': self.filename.format(file_no),
                    'file_handle': tmp_file_handle,
                    'file_mime_type': file_mime_type
                })

                if self.export_format == 'csv':
                    csv_writer = self._configure_csv_file(tmp_file_handle, schema)

        return files_to_upload
Example #16
    def write(self, make_backup=True):
        """
        Write the GUANO .WAV file to disk.

        :param bool make_backup:  create a backup file copy before writing changes or not (default: True);
                                  backups will be saved to a folder named `GUANO_BACKUP`
        :raises ValueError:  if this `GuanoFile` doesn't represent a valid .WAV by having
            appropriate values for `self.wav_params` (see :meth:`wave.Wave_write.setparams()`)
            and `self.wav_data` (see :meth:`wave.Wave_write.writeframes()`)
        """
        # FIXME: optionally write other unknown subchunks for redundant metadata formats

        if not self.filename:
            raise ValueError('Cannot write .WAV file without a self.filename!')
        if not self.wav_params:
            raise ValueError('Cannot write .WAV file without appropriate self.wav_params (see `wavfile.setparams()`)')
        if not self.wav_data:
            raise ValueError('Cannot write .WAV file without appropriate self.wav_data (see `wavfile.writeframes()`)')

        # prepare our metadata for a byte-wise representation
        md_bytes = self.serialize()

        # create tempfile and write our vanilla .WAV ('data' sub-chunk only)
        tempfile = NamedTemporaryFile(mode='w+b', prefix='guano_temp-', suffix='.wav', delete=False)
        if os.path.isfile(self.filename):
            shutil.copystat(self.filename, tempfile.name)

        with closing(wave.Wave_write(tempfile)) as wavfile:
            wavfile.setparams(self.wav_params)
            wavfile.writeframes(self.wav_data)

        # add the 'guan' sub-chunk after the 'data' sub-chunk
        tempfile.write(_chunkid.pack(b'guan'))
        tempfile.write(_chunksz.pack(len(md_bytes)))
        tempfile.write(md_bytes)

        # fix the RIFF file length
        total_size = tempfile.tell()
        tempfile.seek(0x04)
        tempfile.write(_chunksz.pack(total_size - 8))
        tempfile.close()

        # verify it by re-parsing the new version
        GuanoFile(tempfile.name)

        # finally overwrite the original with our new version (and optionally back up first)
        if make_backup and os.path.exists(self.filename):
            backup_dir = os.path.join(os.path.dirname(self.filename), 'GUANO_BACKUP')
            backup_file = os.path.join(backup_dir, os.path.basename(self.filename))
            if not os.path.isdir(backup_dir):
                log.debug('Creating backup dir: %s', backup_dir)
                os.mkdir(backup_dir)
            if os.path.exists(backup_file):
                os.remove(backup_file)
            os.rename(self.filename, backup_file)
        os.rename(tempfile.name, self.filename)
Example #17
    def write(self, make_backup=True):
        """Write the GUANO file to disk"""
        # FIXME: optionally write *other* subchunks for redundant metadata formats

        # prepare our metadata for a byte-wise representation
        md_bytes = bytearray(self._as_string(), 'utf-8')
        if len(md_bytes) % 2:
            md_bytes.append(ord('\n'))  # pad for alignment on even word boundary

        # create tempfile and write our vanilla .WAV ('data' sub-chunk only)
        tempfile = NamedTemporaryFile(mode='w+b', prefix='guano_temp-', suffix='.wav', delete=False)
        shutil.copystat(self.filename, tempfile.name)
        with closing(wave.Wave_write(tempfile)) as wavfile:
            wavfile.setparams(self.wav_params)
            wavfile.writeframes(self.wav_data)

        # add the 'guan' sub-chunk after the 'data' sub-chunk
        tempfile.seek(tempfile.tell())
        tempfile.write(struct.pack('<4sL', 'guan', len(md_bytes)))
        tempfile.write(md_bytes)

        # fix the RIFF file length
        total_size = tempfile.tell()
        tempfile.seek(0x04)
        tempfile.write(struct.pack('<L', total_size - 8))
        tempfile.close()

        # verify it
        GuanoFile(tempfile.name)

        # finally overwrite the original with our new version
        if make_backup:
            backup_dir = os.path.join(os.path.dirname(self.filename), 'GUANO_BACKUP')
            backup_file = os.path.join(backup_dir, os.path.basename(self.filename))
            if not os.path.isdir(backup_dir):
                print >> sys.stderr, 'Creating backup dir: ' + backup_dir
                os.mkdir(backup_dir)
            if os.path.exists(backup_file):
                os.remove(backup_file)
            os.rename(self.filename, backup_file)
        os.rename(tempfile.name, self.filename)
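
Both GUANO examples rely on the same trick: write the data first, learn the final size with tell(), then seek() back to a fixed offset and patch a length field. A reduced sketch of that trick follows; the 4-byte tag plus 4-byte length header here is made up for illustration and is not the actual RIFF layout.

# Write a placeholder length, append the payload, then patch the length field.
import struct
from tempfile import NamedTemporaryFile

with NamedTemporaryFile(mode="w+b", suffix=".bin") as fp:
    fp.write(b"DEMO")                      # 4-byte tag
    fp.write(struct.pack("<L", 0))         # placeholder for the payload length
    fp.write(b"payload bytes go here")
    total_size = fp.tell()
    fp.seek(4)                             # back to the length field
    fp.write(struct.pack("<L", total_size - 8))
    fp.seek(0)
    print(fp.read(8))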
Example #18
    def _create_combined_success(self, success_fps):
        """
        Merge all success_fps into a single success_fp.

        Returns a new success_fp.
        """
        combined_fp = prev_fp = None
        combined_fp = NamedTemporaryFile(delete=True, mode='w+')
        try:
            prev_fp = NamedTemporaryFile(delete=True, mode='w+')  # start blank

            # Add all success_fps into combined_fp. Update prev_fp to
            # hold combined_fp.
            for added_fp in success_fps:
                if added_fp is None:
                    continue

                added_size = added_fp.tell()
                added_fp.seek(0)
                if added_size:
                    prev_size = prev_fp.tell()
                    prev_fp.seek(0)
                    log.info('Merging success lists (%d into %d)', added_size,
                             prev_size)
                    _comm(
                        _comm_input(prev_fp, added_fp),
                        _comm_actions(
                            # Keep it if in both:
                            both=(lambda e: combined_fp.write(e)),
                            # Keep it if we already had it:
                            leftonly=(lambda d: combined_fp.write(d)),
                            # Keep it if we added it now:
                            rightonly=(lambda a: combined_fp.write(a))))
                    combined_fp.flush()

                    # We don't need left anymore. Make combined the new left.
                    # Create new combined where we merge the next success_fp.
                    prev_fp.close()
                    prev_fp, combined_fp = combined_fp, None
                    combined_fp = NamedTemporaryFile(delete=True, mode='w+')

            # We want combined_fp at this point, but it's currently in
            # prev_fp. Note that the new combined_fp is at EOF (unseeked).
            combined_fp.close()
            combined_fp, prev_fp = prev_fp, None
        except Exception:
            if prev_fp:
                prev_fp.close()
            if combined_fp:
                combined_fp.close()
            raise

        return combined_fp
Example #19
def page_download_task(page_num, r, page_url=None):
    ext = BaseChapter.guess_extension(r.headers.get("content-type"))
    f = NamedTemporaryFile(suffix=ext, delete=False)
    download_start_time = int(time.time())
    try:
        for chunk in r.iter_content(chunk_size=4096):
            if chunk:
                f.write(chunk)
    except ConnectionError:
        f.flush()
        # page failed to download, send failure report
        if debug:
            output.warning("Mangadex API: send failure report")
        requests.post("https://api.mangadex.network/report",
                      data={
                          "url": page_url,
                          "success": False,
                          "bytes": f.tell(),
                          "duration": int(time.time()) - download_start_time,
                          "cached": True if r.headers.get("X-Cache") else False
                      })
        raise exceptions.ScrapingError
    f.flush()
    # page download successful, send success report
    if debug:
        output.warning("Mangadex API: send success report")
    requests.post("https://api.mangadex.network/report",
                  data={
                      "url": page_url,
                      "success": True,
                      "bytes": f.tell(),
                      "duration": int(time.time()) - download_start_time,
                      "cached": True if r.headers.get("X-Cache") else False
                  })
    f.close()
    r.close()
    return ((page_num, f))
Example #20
def prepareUploadFile(prefix=""):
    """ Helper function for prerare file to uploading """
    fp = NamedTemporaryFile(mode='w+', prefix=prefix)
    fp.write("google-site-verification: " + fp.name)
    fp.seek(0,2)
    fsize = fp.tell()
    fp.seek(0)

    env = {'REQUEST_METHOD':'PUT'}
    headers = {'content-type':'text/plain',
               'content-length': fsize,
               'content-disposition':'attachment; filename=%s' % fp.name}
    fs = FieldStorage(fp=fp, environ=env, headers=headers)
    return FileUpload(fs), fp
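
The seek(0, 2) / tell() / seek(0) sequence in prepareUploadFile is simply "measure the file size, then rewind". The same thing, written with os.SEEK_END for readability (a minimal sketch, not part of the example):

import os
from tempfile import NamedTemporaryFile

fp = NamedTemporaryFile(mode="w+")
fp.write("some payload")
fp.seek(0, os.SEEK_END)   # jump to the end
size = fp.tell()          # current offset == file size
fp.seek(0)                # rewind so the content can be read again
print(size)
fp.close()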
Example #21
def make_blocks(num_records=2000, codec='null', write_to_disk=False):
    records = make_records(num_records)

    new_file = NamedTemporaryFile() if write_to_disk else MemoryIO()
    fastavro.writer(new_file, schema, records, codec=codec)
    bytes = new_file.tell()

    new_file.seek(0)
    block_reader = fastavro.block_reader(new_file, schema)

    blocks = list(block_reader)

    new_file.close()

    return blocks, records, bytes
Example #23
def make_image_file(dimensions=(320, 240),
                    prefix='tmp',
                    extension='.jpeg',
                    force_size=None,
                    orientation=None):
    """
    Yields a named temporary file created with the specified image type and
    options.

    Note the default dimensions are unequal (not a square) ensuring that center-square
    cropping logic will be exercised during tests.

    The temporary file will be closed and deleted automatically upon exiting
    the `with` block.

    prefix - Optional prefix for the temporary file name, producing names like
            <custom-prefix><random-name><extension>; defaults to `tmp`, which
            gives tmp<random-name><extension>.

    """
    image = Image.new('RGB', dimensions, "green")
    image_file = NamedTemporaryFile(prefix=prefix, suffix=extension)
    try:
        if orientation and orientation in xrange(1, 9):
            exif_bytes = piexif.dump(
                {'0th': {
                    piexif.ImageIFD.Orientation: orientation
                }})
            image.save(image_file, exif=exif_bytes)
        else:
            image.save(image_file)
        if force_size is not None:
            image_file.seek(0, os.SEEK_END)
            bytes_to_pad = force_size - image_file.tell()
            # write in hunks of 256 bytes
            hunk, byte_ = bytearray([0] * 256), bytearray([0])
            num_hunks, remainder = divmod(bytes_to_pad, 256)
            for _ in xrange(num_hunks):
                image_file.write(hunk)
            for _ in xrange(remainder):
                image_file.write(byte_)
            image_file.flush()
        image_file.seek(0)
        yield image_file
    finally:
        image_file.close()
Example #24
def process_askue():
    e = Exporter()
    try:
        logging.debug('Trying to connect to FTP server...')
        with FTP(S.FTP_SERVER, S.FTP_USER, S.FTP_PASSWORD, timeout=5) as fc:
            logging.debug('Looking for files in FTP directory')
            # Find files and retrieve it
            inbox_files = fc.mlsd(S.REMS_PATH)
            filenames = [entry[0] for entry in inbox_files
                         if askue_filename(entry[0])]
            if not filenames:
                logging.info('Inbox directory is empty...')
                return
            if len(filenames) > 1:
                logging.debug(
                    'More than 1 file were found: {}'.format(
                        '\n'.join(filenames)))
            rfile = max(filenames, key=date_from_filename)
            logging.info('Retrieving {}...'.format(rfile))
            tf = NamedTemporaryFile()
            fc.retrbinary('RETR {}'.format(j(S.REMS_PATH, rfile)), tf.write)
            ftp_pos = tf.tell()
            try:
                if S.APPEND_ON:
                    lines = (record_to_csv(rec) for rec in
                             e.get_routes(datetime.now()))
                    append_lines(tf, lines)
                else:
                    logging.debug(
                        'Will not append lines (switched off in settings)')
            except Exception:
                logging.exception(
                    'Error appending lines to file! Sending as is')
                tf.seek(ftp_pos)
                tf.truncate()
            tf.seek(0)
            dest_path = j(S.IOMM_PATH, rfile)
            # Send file back to FTP
            logging.info('Sending file... {}'.format(dest_path))
            fc.storbinary('STOR {}'.format(dest_path), tf)
            logging.info('Cleaning up directory...')
            for fname in filenames:
                filepath = j(S.REMS_PATH, fname)
                fc.delete(filepath)
    finally:
        e.close_connection()
Example #25
    def download_url(self, url, partial_fetch=False):
        http_resp = self.http_session.get(url, stream=True, timeout=(60, 120))
        http_resp.raise_for_status()

        if not os.path.exists(TEMP_DIR_PATH):
            log.debug('Creating temp directory %s' % TEMP_DIR_PATH)
            os.makedirs(TEMP_DIR_PATH)

        # Create a temporary file to store the media item, write the file
        # to disk if it is larger than 1 MB.
        media_file = NamedTemporaryFile(dir=TEMP_DIR_PATH)

        # When a partial fetch is requested, request up to two MB
        partial_target_size = 1024 * 1024 * 2
        content_length = http_resp.headers.get('content-length')
        if content_length and int(content_length) < partial_target_size:
            partial_target_size = int(content_length)

        retrieved_bytes = 0
        for chunk in http_resp.iter_content(chunk_size=512 * 1024):
            if chunk:  # filter out keep-alive chunks
                media_file.write(chunk)
                retrieved_bytes += len(chunk)

            if partial_fetch and retrieved_bytes >= partial_target_size:
                break

        media_file.flush()
        log.debug('Fetched item %s [%s/%s]' % (url, retrieved_bytes, content_length))

        # If the server doesn't provide a content-length and this isn't
        # a partial fetch, determine the size by looking at the retrieved
        # content
        if not content_length and not partial_fetch:
            media_file.seek(0, 2)
            content_length = media_file.tell()

        media_file.seek(0, 0)

        return (
            http_resp.headers.get('content-type'),
            content_length,
            media_file
        )
Example #26
    def download_url(self, url, partial_fetch=False):
        http_resp = self.http_session.get(url,
                                          stream=True,
                                          timeout=(60, 120),
                                          verify=False)
        http_resp.raise_for_status()

        # Create a temporary file to store the media item, write the file
        # to disk if it is larger than 1 MB.
        media_file = NamedTemporaryFile(delete=True)

        # When a partial fetch is requested, request up to two MB
        partial_target_size = 1024 * 1024 * 2
        content_length = http_resp.headers.get('content-length')
        if content_length and int(content_length) < partial_target_size:
            partial_target_size = int(content_length)

        retrieved_bytes = 0
        for chunk in http_resp.iter_content(chunk_size=512 * 1024):
            if chunk:  # filter out keep-alive chunks
                media_file.write(chunk)
                retrieved_bytes += len(chunk)

            if partial_fetch and retrieved_bytes >= partial_target_size:
                break

        media_file.flush()
        log.debug('Fetched item %s [%s/%s]' %
                  (url, retrieved_bytes, content_length))

        # If the server doesn't provide a content-length and this isn't
        # a partial fetch, determine the size by looking at the retrieved
        # content
        if not content_length and not partial_fetch:
            media_file.seek(0, 2)
            content_length = media_file.tell()

        media_file.seek(0, 0)

        resource = FileResource(media_file)
        resource.content_type = http_resp.headers.get('content-type')
        resource.file_size = content_length
        return resource
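
When the server sends no Content-Length, both download_url variants fall back to measuring the spooled temp file itself. A stripped-down sketch of that fallback, where iter_chunks stands in for response.iter_content() so it runs without a network:

from tempfile import NamedTemporaryFile

def spool_to_tempfile(iter_chunks):
    media_file = NamedTemporaryFile(delete=True)
    for chunk in iter_chunks:
        if chunk:                       # skip keep-alive style empty chunks
            media_file.write(chunk)
    media_file.flush()
    media_file.seek(0, 2)               # jump to the end of what was written
    content_length = media_file.tell()  # the size actually retrieved
    media_file.seek(0, 0)               # rewind for the caller
    return media_file, content_length

# fp, n = spool_to_tempfile([b"abc", b"", b"defgh"])   # n == 8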
Example #27
    def _write_local_data_files(self, cursor):
        """
        Takes a cursor, and writes results to a local file.

        :return: A dictionary where keys are filenames to be used as object
            names in GCS, and values are file handles to local files that
            contain the data for the GCS objects.
        """
        field_names = list(
            map(lambda schema_tuple: schema_tuple[0], cursor.description))
        mysql_types = list(
            map(lambda schema_tuple: schema_tuple[1], cursor.description))
        byte_fields = [
            self.is_binary(t, f)
            for t, f in zip(mysql_types, cursor.description_flags)
        ]

        file_no = 0
        tmp_file_handle = NamedTemporaryFile(mode='w', delete=True)
        tmp_file_handles = {self.filename.format(file_no): tmp_file_handle}

        for row in cursor:
            # Convert datetime objects to utc seconds, decimals to floats, and binaries
            # to base64-encoded strings
            row_dict = {}
            for name, value, is_binary in zip(field_names, row, byte_fields):
                row_dict[name] = self.convert_types(value, is_binary)

            # TODO validate that row isn't > 2MB. BQ enforces a hard row size of 2MB.
            json.dump(row_dict, tmp_file_handle)

            # Append newline to make dumps BigQuery compatible.
            tmp_file_handle.write('\n')

            # Stop if the file exceeds the file size limit.
            if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
                file_no += 1
                tmp_file_handle = NamedTemporaryFile(mode='w', delete=True)
                tmp_file_handles[self.filename.format(
                    file_no)] = tmp_file_handle

        return tmp_file_handles
Example #28
    def _write_local_data_files(self, cursor):
        """
        Takes a cursor, and writes results to a local file.

        :return: A dictionary where keys are filenames to be used as object
            names in GCS, and values are file handles to local files that
            contain the data for the GCS objects.
        """
        class BinaryTypeEncoder(json.JSONEncoder):
            def default(self, obj):
                if PY3 and isinstance(obj, binary_type):
                    return str(obj, 'utf-8')
                return json.JSONEncoder.default(self, obj)

        schema = list(
            map(lambda schema_tuple: schema_tuple[0], cursor.description))
        file_no = 0
        tmp_file_handle = NamedTemporaryFile(delete=True)
        tmp_file_handles = {self.filename.format(file_no): tmp_file_handle}

        for row in cursor:
            # Convert datetime objects to utc seconds, and decimals to floats
            row = map(self.convert_types, row)
            row_dict = dict(zip(schema, row))

            # TODO validate that row isn't > 2MB. BQ enforces a hard row size of 2MB.
            s = json.dumps(row_dict, cls=BinaryTypeEncoder)
            if PY3:
                s = s.encode('utf-8')
            tmp_file_handle.write(s)

            # Append newline to make dumps BigQuery compatible.
            tmp_file_handle.write(b'\n')

            # Stop if the file exceeds the file size limit.
            if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
                file_no += 1
                tmp_file_handle = NamedTemporaryFile(delete=True)
                tmp_file_handles[self.filename.format(
                    file_no)] = tmp_file_handle

        return tmp_file_handles
Example #29
def csv_export(request, username, id_string):
    owner = get_object_or_404(User, username=username)
    xform = get_object_or_404(XForm, id_string=id_string, user=owner)
    if not has_permission(xform, owner, request):
        return HttpResponseForbidden(_(u"Not shared."))
    query = request.GET.get("query")
    csv_dataframe_builder = CSVDataFrameBuilder(username, id_string, query)
    try:
        temp_file = NamedTemporaryFile(suffix=".csv")
        csv_dataframe_builder.export_to(temp_file)
        if request.GET.get("raw"):
            id_string = None
        response = response_with_mimetype_and_name("application/csv", id_string, extension="csv")
        temp_file.seek(0)
        response.write(temp_file.read())
        temp_file.seek(0, os.SEEK_END)
        response["Content-Length"] = temp_file.tell()
        temp_file.close()
        return response
    except NoRecordsFoundError:
        return HttpResponse(_("No records found to export"))
Example #30
    def _write_local_data_files(self, cursor):
        """
        Takes a cursor, and writes results to a local file.

        :return: A dictionary where keys are filenames to be used as object
            names in GCS, and values are file handles to local files that
            contain the data for the GCS objects.
        """
        schema = list(
            map(lambda schema_tuple: schema_tuple[0], cursor.description))
        file_no = 0
        row_no = 0
        tmp_file_handle = NamedTemporaryFile(delete=True)
        tmp_file_handles = {self.filename.format(file_no): tmp_file_handle}

        for row in cursor:
            # Convert datetime objects to utc seconds, and decimals to floats
            row = map(self.convert_types, row)
            row_dict = dict(zip(schema, row))

            s = json.dumps(row_dict, sort_keys=True)
            if PY3:
                s = s.encode("utf-8")
            tmp_file_handle.write(s)

            # Append newline to make dumps BigQuery compatible.
            tmp_file_handle.write(b"\n")

            # Stop if the file exceeds the file size limit.
            if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
                file_no += 1
                tmp_file_handle = NamedTemporaryFile(delete=True)
                tmp_file_handles[self.filename.format(
                    file_no)] = tmp_file_handle
            row_no += 1

        self.log.info("Received %s rows over %s files", row_no, file_no + 1)

        return tmp_file_handles
Example #31
def file_reader(url, proxy, sslContext):
    local_file = NamedTemporaryFile(mode='w+b', prefix='subcontractor_')
    logging.debug('file_reader: downloading "{0}"'.format(url))
    resp = open_url(url, proxy, 200, sslContext)

    size = int(resp.headers['content-length'])

    buff = resp.read(4096 * 1024)
    cp = datetime.utcnow()
    while buff:
        if datetime.utcnow() > cp:
            cp = datetime.utcnow() + timedelta(seconds=PROGRESS_INTERVAL)
            logging.debug('file_reader: download at {0} of {1}'.format(
                local_file.tell(), size))

        local_file.write(buff)
        buff = resp.read(4096 * 1024)

    local_file.flush()
    local_file.seek(0)

    return local_file
Example #32
def xls_export(request, username, id_string):
    owner = get_object_or_404(User, username=username)
    xform = get_object_or_404(XForm, id_string=id_string, user=owner)
    if not has_permission(xform, owner, request):
        return HttpResponseForbidden(_(u'Not shared.'))
    query = request.GET.get("query")
    force_xlsx = request.GET.get('xlsx') == 'true'
    xls_df_builder = XLSDataFrameBuilder(username, id_string, query)
    excel_defs = {
      'xls': {
        'suffix': '.xls',
        'mime_type': 'vnd.ms-excel'
      },
      'xlsx': {
        'suffix': '.xlsx',
        'mime_type': 'vnd.openxmlformats' # TODO: check xlsx mime type
      }
    }
    ext = 'xls' if not force_xlsx else 'xlsx'
    if xls_df_builder.exceeds_xls_limits:
        ext = 'xlsx'
    try:
        temp_file = NamedTemporaryFile(suffix=excel_defs[ext]['suffix'])
        xls_df_builder.export_to(temp_file.name)

        if request.GET.get('raw'):
            id_string = None
        response = response_with_mimetype_and_name(excel_defs[ext]['mime_type'], id_string,
                                                   extension=ext)
        response.write(temp_file.read())
        temp_file.seek(0, os.SEEK_END)
        response['Content-Length'] = temp_file.tell()
        temp_file.close()
        return response
    except NoRecordsFoundError:
        return HttpResponse(_("No records found to export"))
Example #33
def test_roundtrip3():
    if not PYTHON3:
        print(
            "test skipped because Python 2.x has problems creating Python 3.x files"
        )
        return
    fp = NamedTemporaryFile(mode="wb+",
                            suffix=".pyc",
                            prefix="test_pyc-",
                            delete=False)
    orig_path = "testdata/test_pyc.pyc"
    version, timestamp, magic_int, co, is_pypy, source_size, sip_hash = load_module(
        orig_path)
    write_pycfile(fp, [co], timestamp, version)
    new_path = fp.name
    size = fp.tell()
    fp.close()
    print("Wrote Python %s bytecode file %s; %d bytes" %
          (version, fp.name, size))
    old_fp = open(orig_path, "rb")
    new_fp = open(new_path, "rb")
    compare_size = 590
    assert old_fp.read(compare_size) == new_fp.read(compare_size)
    os.unlink(new_path)
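
test_roundtrip3 depends on NamedTemporaryFile(delete=False): the handle can be closed, the file reopened by name for comparison, and removed explicitly afterwards. A minimal version of that lifecycle:

import os
from tempfile import NamedTemporaryFile

fp = NamedTemporaryFile(mode="wb", suffix=".bin", delete=False)
fp.write(b"\x00\x01\x02")
path = fp.name
size = fp.tell()
fp.close()                      # file survives because delete=False

with open(path, "rb") as reopened:
    assert reopened.read() == b"\x00\x01\x02"
print("wrote %d bytes to %s" % (size, path))
os.unlink(path)                 # clean up explicitly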
Example #34
    def _write_local_data_files(self, cursor):
        """
        Takes a cursor, and writes results to a local file.

        :return: A list of dictionaries, one per local file, each holding the
            file name to be used as the object name in GCS, the file handle,
            and the file MIME type.
        """
        schema = list(map(lambda schema_tuple: schema_tuple[0], cursor.description))
        col_type_dict = self._get_col_type_dict()
        file_no = 0
        tmp_file_handle = NamedTemporaryFile(delete=True)
        if self.export_format == 'csv':
            file_mime_type = 'text/csv'
        elif self.export_format == 'parquet':
            file_mime_type = 'application/octet-stream'
        else:
            file_mime_type = 'application/json'
        files_to_upload = [
            {
                'file_name': self.filename.format(file_no),
                'file_handle': tmp_file_handle,
                'file_mime_type': file_mime_type,
            }
        ]
        self.log.info("Current file count: %d", len(files_to_upload))

        if self.export_format == 'csv':
            csv_writer = self._configure_csv_file(tmp_file_handle, schema)
        if self.export_format == 'parquet':
            parquet_schema = self._convert_parquet_schema(cursor)
            # parquet_writer = self._configure_parquet_file(tmp_file_handle, parquet_schema)

        for row in cursor:
            # Convert datetime objects to utc seconds, and decimals to floats.
            # Convert binary type object to string encoded with base64.
            row = self.convert_types(schema, col_type_dict, row)

            if self.export_format == 'csv':
                if self.null_marker is not None:
                    row = [value if value is not None else self.null_marker for value in row]
                csv_writer.writerow(row)
            elif self.export_format == 'parquet':
                if self.null_marker is not None:
                    row = [value if value is not None else self.null_marker for value in row]
                row_pydic = {col: [value] for col, value in zip(schema, row)}
                tbl = pa.Table.from_pydict(row_pydic)
                with pq.ParquetWriter(tmp_file_handle, parquet_schema) as parquet_writer:
                    parquet_writer.write_table(tbl)
            else:
                row_dict = dict(zip(schema, row))

                tmp_file_handle.write(
                    json.dumps(row_dict, sort_keys=True, ensure_ascii=False).encode("utf-8")
                )

                # Append newline to make dumps BigQuery compatible.
                tmp_file_handle.write(b'\n')

            # Stop if the file exceeds the file size limit.
            if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
                file_no += 1
                tmp_file_handle = NamedTemporaryFile(delete=True)
                files_to_upload.append(
                    {
                        'file_name': self.filename.format(file_no),
                        'file_handle': tmp_file_handle,
                        'file_mime_type': file_mime_type,
                    }
                )
                self.log.info("Current file count: %d", len(files_to_upload))
                if self.export_format == 'csv':
                    csv_writer = self._configure_csv_file(tmp_file_handle, schema)
        return files_to_upload
Example #35
class LazyZipOverHTTP:
    """File-like object mapped to a ZIP file over HTTP.

    This uses HTTP range requests to lazily fetch the file's content,
    which is supposed to be fed to ZipFile.  If such requests are not
    supported by the server, raise HTTPRangeRequestUnsupported
    during initialization.
    """
    def __init__(self,
                 url: str,
                 session: PipSession,
                 chunk_size: int = CONTENT_CHUNK_SIZE) -> None:
        head = session.head(url, headers=HEADERS)
        raise_for_status(head)
        assert head.status_code == 200
        self._session, self._url, self._chunk_size = session, url, chunk_size
        self._length = int(head.headers["Content-Length"])
        self._file = NamedTemporaryFile()
        self.truncate(self._length)
        self._left: List[int] = []
        self._right: List[int] = []
        if "bytes" not in head.headers.get("Accept-Ranges", "none"):
            raise HTTPRangeRequestUnsupported("range request is not supported")
        self._check_zip()

    @property
    def mode(self) -> str:
        """Opening mode, which is always rb."""
        return "rb"

    @property
    def name(self) -> str:
        """Path to the underlying file."""
        return self._file.name

    def seekable(self) -> bool:
        """Return whether random access is supported, which is True."""
        return True

    def close(self) -> None:
        """Close the file."""
        self._file.close()

    @property
    def closed(self) -> bool:
        """Whether the file is closed."""
        return self._file.closed

    def read(self, size: int = -1) -> bytes:
        """Read up to size bytes from the object and return them.

        As a convenience, if size is unspecified or -1,
        all bytes until EOF are returned.  Fewer than
        size bytes may be returned if EOF is reached.
        """
        download_size = max(size, self._chunk_size)
        start, length = self.tell(), self._length
        stop = length if size < 0 else min(start + download_size, length)
        start = max(0, stop - download_size)
        self._download(start, stop - 1)
        return self._file.read(size)

    def readable(self) -> bool:
        """Return whether the file is readable, which is True."""
        return True

    def seek(self, offset: int, whence: int = 0) -> int:
        """Change stream position and return the new absolute position.

        Seek to offset relative position indicated by whence:
        * 0: Start of stream (the default).  pos should be >= 0;
        * 1: Current position - pos may be negative;
        * 2: End of stream - pos usually negative.
        """
        return self._file.seek(offset, whence)

    def tell(self) -> int:
        """Return the current position."""
        return self._file.tell()

    def truncate(self, size: Optional[int] = None) -> int:
        """Resize the stream to the given size in bytes.

        If size is unspecified resize to the current position.
        The current stream position isn't changed.

        Return the new file size.
        """
        return self._file.truncate(size)

    def writable(self) -> bool:
        """Return False."""
        return False

    def __enter__(self) -> "LazyZipOverHTTP":
        self._file.__enter__()
        return self

    def __exit__(self, *exc: Any) -> Optional[bool]:
        return self._file.__exit__(*exc)

    @contextmanager
    def _stay(self) -> Iterator[None]:
        """Return a context manager keeping the position.

        At the end of the block, seek back to original position.
        """
        pos = self.tell()
        try:
            yield
        finally:
            self.seek(pos)

    def _check_zip(self) -> None:
        """Check and download until the file is a valid ZIP."""
        end = self._length - 1
        for start in reversed(range(0, end, self._chunk_size)):
            self._download(start, end)
            with self._stay():
                try:
                    # For read-only ZIP files, ZipFile only needs
                    # methods read, seek, seekable and tell.
                    ZipFile(self)  # type: ignore
                except BadZipfile:
                    pass
                else:
                    break

    def _stream_response(self,
                         start: int,
                         end: int,
                         base_headers: Dict[str, str] = HEADERS) -> Response:
        """Return HTTP response to a range request from start to end."""
        headers = base_headers.copy()
        headers["Range"] = f"bytes={start}-{end}"
        # TODO: Get range requests to be correctly cached
        headers["Cache-Control"] = "no-cache"
        return self._session.get(self._url, headers=headers, stream=True)

    def _merge(self, start: int, end: int, left: int,
               right: int) -> Iterator[Tuple[int, int]]:
        """Return an iterator of intervals to be fetched.

        Args:
            start (int): Start of needed interval
            end (int): End of needed interval
            left (int): Index of first overlapping downloaded data
            right (int): Index after last overlapping downloaded data
        """
        lslice, rslice = self._left[left:right], self._right[left:right]
        i = start = min([start] + lslice[:1])
        end = max([end] + rslice[-1:])
        for j, k in zip(lslice, rslice):
            if j > i:
                yield i, j - 1
            i = k + 1
        if i <= end:
            yield i, end
        self._left[left:right], self._right[left:right] = [start], [end]

    def _download(self, start: int, end: int) -> None:
        """Download bytes from start to end inclusively."""
        with self._stay():
            left = bisect_left(self._right, start)
            right = bisect_right(self._left, end)
            for start, end in self._merge(start, end, left, right):
                response = self._stream_response(start, end)
                response.raise_for_status()
                self.seek(start)
                for chunk in response_chunks(response, self._chunk_size):
                    self._file.write(chunk)
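The _merge/_download pair above keeps two sorted lists of byte ranges that are already on disk and only issues range requests for the gaps. A minimal standalone sketch of that interval-coalescing step, with plain lists standing in for the class's _left/_right attributes:

from bisect import bisect_left, bisect_right
from typing import Iterator, List, Tuple

def missing_intervals(left: List[int], right: List[int],
                      start: int, end: int) -> Iterator[Tuple[int, int]]:
    """Yield the sub-ranges of [start, end] not covered yet and record
    the merged interval in left/right (both kept sorted and disjoint)."""
    lo = bisect_left(right, start)      # first stored range that may overlap
    hi = bisect_right(left, end)        # one past the last overlapping range
    lslice, rslice = left[lo:hi], right[lo:hi]
    i = new_start = min([start] + lslice[:1])
    new_end = max([end] + rslice[-1:])
    for j, k in zip(lslice, rslice):
        if j > i:                       # gap before the next stored range
            yield i, j - 1
        i = k + 1
    if i <= end:
        yield i, end
    left[lo:hi], right[lo:hi] = [new_start], [new_end]

# Example: bytes 0-9 and 20-29 are cached; asking for 5-25 only fetches 10-19.
left, right = [0, 20], [9, 29]
print(list(missing_intervals(left, right, 5, 25)))   # [(10, 19)]
print(left, right)                                   # [0] [29]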
Example #36
0
class TestWait(unittest.TestCase):

    def setUp(self):
        self.file = NamedTemporaryFile()
        self.port = 9999
        self.patterns = ['foo', 'bar', 'f.*']

    def pattern(self, *args, **kwargs):
        return wait.log.pattern(self.file.name, *args, **kwargs)

    def write(self, s):
        self.file.write(s.encode('utf-8'))
        self.file.write('\n'.encode('utf-8'))
        self.file.flush()

    def test_log_exists(self):
        assert wait.log.exists(self.file.name)

    def test_log_exists_timeout(self):
        assert not wait.log.exists('/tmp/nolog', timeout=1)

    def test_log_pattern_list(self):
        seek = self.file.tell()
        self.write(self.patterns[0])
        self.write(self.patterns[1])
        assert self.pattern(self.patterns, seek=seek, timeout=5)

    def test_log_pattern_tuple(self):
        seek = self.file.tell()
        self.write(self.patterns[0])
        self.write(self.patterns[1])
        assert self.pattern(tuple(self.patterns), seek=seek, timeout=5)

    def test_log_pattern_string(self):
        seek = self.file.tell()
        self.write(self.patterns[0])
        assert self.pattern(self.patterns[0], seek=seek, timeout=5)

    def test_log_pattern_nostart(self):
        p = self.pattern(self.patterns, run=False, timeout=5)
        self.write(self.patterns[0])
        self.write(self.patterns[1])
        assert p()

    def test_log_pattern_timeout(self):
        assert not wait.log.pattern('/tmp/nolog', self.patterns, timeout=1)
        assert not self.pattern(self.patterns, timeout=1)

    def test_tcp_closed(self):
        assert wait.tcp.closed(self.port, timeout=1)
        assert not wait.tcp.open(self.port, timeout=1)

    def test_tcp_open(self):
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.bind(('localhost', self.port))
        s.listen(0)
        assert not wait.tcp.closed(self.port, timeout=1)
        assert wait.tcp.open(self.port, timeout=1)
        assert wait.tcp.open(80, host='www.google.com', timeout=5)
        s.close()

    def test_tcp_socket_timeout(self):
        assert wait.tcp.closed(self.port, host='10.255.255.1', timeout=1)
        assert not wait.tcp.open(self.port, host='10.255.255.1', timeout=1)

    def test_tcp_open_timeout(self):
        assert not wait.tcp.open(self.port, timeout=1)

    def tearDown(self):
        self.file.close()
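These tests poll a NamedTemporaryFile-backed log until a regex appears after a saved seek offset. A rough, hypothetical stand-in for what wait.log.pattern is exercised to do here (not the wait library's actual implementation): re-read the file from the offset until any pattern matches or a timeout expires.

import re
import time

def wait_for_pattern(path, patterns, seek=0, timeout=5.0, interval=0.1):
    """Poll a log file until any of the regex patterns appears after `seek`.

    Returns True on a match, False when the timeout expires.
    """
    if isinstance(patterns, str):
        patterns = [patterns]
    compiled = [re.compile(p) for p in patterns]
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with open(path, 'r') as fh:
                fh.seek(seek)
                text = fh.read()
        except OSError:
            text = ''                   # log file may not exist yet
        if any(c.search(text) for c in compiled):
            return True
        time.sleep(interval)
    return False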
Example #38
0
    def getFile(self,
                uri,
                target_dir='/tmp',
                file_object=None,
                cb=None,
                timeout=30,
                chunk_size=(4096 * 1024)):
        """
    if file_object is defined:
       The file contense are written to it and the filename as specified by the
       server is returned, None is returned if not filename is detected.  The
       file_object is not closed. file_object must be opened with the 'b' attribute.

    Otherwise a file is created in target_dir, and the full path is returned.  If the
      filename is not specified by the server, and a random filename is chosen.
      WARNING: there isn't checking done to make sure the target file does not allready
      exist, there is a possibility it could clober something that allready exists.
      we do make sure the filename fits a regex pattern that prevents it from escaping
      the target_dir.  The "filename" as sent by the server is the "model" of the uri.
      make sure target_dir exists before calling getFile
    """

        uri_parser = URI('/')
        try:  # TODO: There has to be a better way to validate this uri
            (_, filename, _, _, _) = uri_parser.split(uri)
        except ValueError as e:
            raise InvalidRequest(str(e))

        # Due to the return value we have to do our own request; this is pretty much a straight GET
        url = '{0}{1}'.format(self.host, uri)
        req = request.Request(url)
        req.get_method = lambda: 'GET'
        try:
            resp = self.opener.open(req, timeout=timeout)

        except request.HTTPError as e:
            raise ResponseError('HTTPError "{0}"'.format(e))

        except request.URLError as e:
            if isinstance(e.reason, socket.timeout):
                raise Timeout(
                    'Request Timeout after {0} seconds'.format(timeout))

            raise ResponseError('URLError "{0}" for "{1}" via "{2}"'.format(
                e, url, self.proxy))

        http_code = resp.code
        if http_code != 200:
            logging.warning(
                'cinp: unexpected HTTP Code "{0}" for File Get'.format(
                    http_code))
            raise ResponseError(
                'Unexpected HTTP Code "{0}" for File Get'.format(http_code))

        try:
            size = resp.headers['Content-Length']
        except KeyError:
            size = 0

        if file_object is not None:
            file_writer = file_object

        else:
            if filename is None:
                file_writer = NamedTemporaryFile(dir=target_dir, mode='wb')
                filename = file_writer.name

            else:
                filename = os.path.join(target_dir, filename)
                file_writer = open(filename, 'wb')

        buff = resp.read(chunk_size)
        while buff:
            file_writer.write(buff)
            if cb:
                cb(file_writer.tell(), size)
            buff = resp.read(chunk_size)

        resp.close()

        if file_object is not None:
            return filename

        else:
            file_writer.close()
            return filename
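The core of getFile is a chunked copy from the HTTP response into a file object, reporting progress through tell(). A trimmed-down sketch of that pattern, with urllib.request.urlopen standing in for the opener used above and a NamedTemporaryFile as the destination:

from tempfile import NamedTemporaryFile
from urllib import request

def fetch_to_tempfile(url, chunk_size=4096 * 1024, timeout=30, cb=None):
    """Stream a URL into a NamedTemporaryFile, reporting progress via cb.

    cb, if given, is called as cb(bytes_written, total_size); total_size is 0
    when the server sends no Content-Length.
    """
    resp = request.urlopen(url, timeout=timeout)
    size = int(resp.headers.get('Content-Length', 0) or 0)
    tmp = NamedTemporaryFile(prefix='download-', delete=False)
    buff = resp.read(chunk_size)
    while buff:
        tmp.write(buff)
        if cb:
            cb(tmp.tell(), size)
        buff = resp.read(chunk_size)
    resp.close()
    tmp.flush()
    return tmp.name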
Example #39
0
class File (object):
    """
    A file wrapper that smooths over some platform-specific
    operations.
    """
    def __init__(self, name=None, readonly=False, **kwargs):
        if name is None:
            self.file = NamedTemporaryFile(**kwargs)
        else:
            if exists(name):
                if readonly:
                    self.file = open(name, 'rb')
                else:
                    self.file = open(name, 'r+b')
            else:
                if readonly:
                    raise OSError('No "%s" found.' % name)
                self.file = open(name, 'w+b')
        if readonly:
            assert self.is_readonly()
        self.has_lock = False

    def get_name(self):
        return self.file.name

    def is_temporary(self):
        return isinstance(self.file, _TemporaryFileWrapper)

    def is_readonly(self):
        return self.file.mode == 'rb'

    def seek(self, n, whence=0):
        self.file.seek(n, whence)
        if whence == 0:
            assert self.file.tell() == n

    def seek_end(self):
        self.file.seek(0, 2)

    def read(self, n=None):
        if n is None:
            return self.file.read()
        else:
            return self.file.read(n)

    def tell(self):
        return self.file.tell()

    def stat(self):
        return os.stat(self.get_name())

    def __len__(self):
        return self.stat().st_size

    def rename(self, name):
        old_name = self.get_name()
        if name == old_name:
            return
        assert not self.is_temporary()
        self.obtain_lock()
        self.close()
        if exists(name):
            os.unlink(name)
        os.rename(old_name, name)
        self.file = open(name, 'r+b')
        self.obtain_lock()

    def obtain_lock(self):
        """
        Make sure that we have an exclusive lock on self.file before
        doing a write.
        If the lock is not available, raise an exception.
        """
        assert not self.is_readonly()
        if not self.has_lock:
            if os.name == 'nt':
                try:
                    win32file.LockFileEx(
                        win32file._get_osfhandle(self.file.fileno()),
                        (win32con.LOCKFILE_EXCLUSIVE_LOCK |
                         win32con.LOCKFILE_FAIL_IMMEDIATELY),
                        0, -65536, pywintypes.OVERLAPPED())
                except pywintypes.error:
                    raise IOError("Unable to obtain lock")
            else:
                fcntl.flock(self.file, fcntl.LOCK_EX | fcntl.LOCK_NB)
            self.has_lock = True

    def release_lock(self):
        """
        Make sure that we do not retain an exclusive lock on self.file.
        """
        if self.has_lock:
            if os.name == 'nt':
                win32file.UnlockFileEx(
                    win32file._get_osfhandle(self.file.fileno()),
                    0, -65536, pywintypes.OVERLAPPED())
            else:
                fcntl.flock(self.file, fcntl.LOCK_UN)
            self.has_lock = False

    def write(self, s):
        self.obtain_lock()
        self.file.write(s)
        # This flush helps the file know where it ends.
        self.file.flush()

    def truncate(self):
        self.obtain_lock()
        self.file.truncate()

    def close(self):
        self.release_lock()
        self.file.close()

    def flush(self):
        self.file.flush()

    def fsync(self):
        if hasattr(os, 'fsync'):
            os.fsync(self.file)
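The locking half of this wrapper boils down to a non-blocking exclusive flock on POSIX (win32file handles the Windows side). A POSIX-only sketch of that sequence on a NamedTemporaryFile:

import fcntl                          # POSIX only; the wrapper above uses win32file on Windows
from tempfile import NamedTemporaryFile

tmp = NamedTemporaryFile()
# Take a non-blocking exclusive lock before writing; OSError means another
# process already holds it (the wrapper above reports that as IOError).
fcntl.flock(tmp.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
tmp.write(b'locked write\n')
tmp.flush()
fcntl.flock(tmp.fileno(), fcntl.LOCK_UN)
tmp.close()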
Example #40
0
    def _write_local_data_files(self, cursor):
        """
        Takes a cursor, and writes results to a local file.

        :return: A dictionary where keys are filenames to be used as object
            names in GCS, and values are file handles to local files that
            contain the data for the GCS objects.
        """
        schema = list(
            map(lambda schema_tuple: schema_tuple[0], cursor.description))
        file_no = 0
        tmp_file_handle = NamedTemporaryFile(delete=True)
        tmp_file_handles = {self.filename.format(file_no): tmp_file_handle}

        # Save file header for csv if required
        if (self.export_format['file_format'] == 'csv'):

            # Deal with CSV formatting. Try to use dialect if passed
            if ('csv_dialect' in self.export_format):
                # Use dialect name from params
                dialect_name = self.export_format['csv_dialect']
            else:
                # Create internal dialect based on parameters passed
                dialect_name = 'mysql_to_gcs'
                csv.register_dialect(
                    dialect_name,
                    delimiter=self.export_format.get('csv_delimiter') or ',',
                    doublequote=self.export_format.get('csv_doublequote')
                    or 'True',
                    escapechar=self.export_format.get('csv_escapechar')
                    or None,
                    lineterminator=self.export_format.get('csv_lineterminator')
                    or '\r\n',
                    quotechar=self.export_format.get('csv_quotechar') or '"',
                    quoting=eval(
                        self.export_format.get('csv_quoting')
                        or 'csv.QUOTE_MINIMAL'))
            # Create CSV writer using either provided or generated dialect
            csv_writer = csv.writer(tmp_file_handle,
                                    encoding='utf-8',
                                    dialect=dialect_name)

            # Include column header in first row
            if ('csv_columnheader' in self.export_format
                    and eval(self.export_format['csv_columnheader'])):
                csv_writer.writerow(schema)

        for row in cursor:
            # Convert datetimes and longs to BigQuery safe types
            row = map(self.convert_types, row)

            # Save rows as CSV
            if (self.export_format['file_format'] == 'csv'):
                csv_writer.writerow(row)
            # Save rows as JSON
            else:
                # Convert datetime objects to utc seconds, and decimals to floats
                row_dict = dict(zip(schema, row))

                # TODO validate that row isn't > 2MB. BQ enforces a hard row size of 2MB.
                s = json.dumps(row_dict, sort_keys=True)
                if PY3:
                    s = s.encode('utf-8')
                tmp_file_handle.write(s)

                # Append newline to make dumps BigQuery compatible.
                tmp_file_handle.write(b'\n')

            # Stop if the file exceeds the file size limit.
            if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
                file_no += 1
                tmp_file_handle = NamedTemporaryFile(delete=True)
                tmp_file_handles[self.filename.format(
                    file_no)] = tmp_file_handle

                # For CSV files, we need to create a new writer with the new
                # handle and write the header in the first row
                if (self.export_format['file_format'] == 'csv'):
                    csv_writer = csv.writer(tmp_file_handle,
                                            encoding='utf-8',
                                            dialect=dialect_name)
                    if ('csv_columnheader' in self.export_format
                            and eval(self.export_format['csv_columnheader'])):
                        csv_writer.writerow(schema)

        return tmp_file_handles
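The pattern here, rolling over to a fresh NamedTemporaryFile whenever tell() crosses approx_max_file_size_bytes, works independently of the CSV/JSON details. A condensed sketch of just that size-based splitting (name_template and the row iterable are illustrative, not part of the operator above):

import json
from tempfile import NamedTemporaryFile

def split_rows_into_tempfiles(rows, name_template='export_{}.json',
                              approx_max_bytes=1024 * 1024):
    """Write dict rows as newline-delimited JSON, rolling over to a new
    NamedTemporaryFile whenever tell() crosses the size threshold.

    Returns {object_name: file_handle}, mirroring the structure the
    operator above hands to its upload step.
    """
    file_no = 0
    handle = NamedTemporaryFile(delete=True)
    handles = {name_template.format(file_no): handle}
    for row in rows:
        handle.write(json.dumps(row, sort_keys=True).encode('utf-8'))
        handle.write(b'\n')
        if handle.tell() >= approx_max_bytes:
            file_no += 1
            handle = NamedTemporaryFile(delete=True)
            handles[name_template.format(file_no)] = handle
    return handles

# Roughly 3 KB of rows with a 1 KB cap -> several small files.
files = split_rows_into_tempfiles(({'i': i, 'payload': 'x' * 60} for i in range(40)),
                                  approx_max_bytes=1024)
print(len(files))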
Example #41
0
class Pyforce(object):
    def __init__(self, *args):
        """
        Create an iterator over results of a p4 call. The args here are p4
        CLI arguments. See p4 help for more information.
        """
        self.args = [str(arg) for arg in args]
        from subprocess import Popen, PIPE
        from tempfile import NamedTemporaryFile
        self.stderr = NamedTemporaryFile()
        if os.environ.get('DEBUG', ''):
            print(f'## p4', *self.args, file=sys.stderr)
        try:
            timeout = abs(int(os.environ['O4_P4_TIMEOUT']))
        except:
            timeout = 120
        self.pope = Popen(['p4', f'-vnet.maxwait={timeout}', '-G'] + self.args,
                          stdout=PIPE,
                          stderr=self.stderr)
        self.transform = Pyforce.to_str
        self.errors = []

    def __iter__(self):
        return self

    def __next__(self):
        """
        Returns the next p4 result object from the command. If the p4
        command experiences a timeout, raise P4TimeoutError. All other
        errors are accumulated during the run and raised as arguments
        on a single P4Error object after the p4 process has been
        exhausted.

        Certain errors are not really errors, it's just p4 being
        silly. Such as the error "No files to reconcile" when you
        reconcile files that have the correct content. Such errors are
        converted to code=stat and passed on. Some may also produce a
        '#o4pass'-prefixed line on stdout, which, in a complete run,
        will make their way to "o4 fail" and be reported.
        """
        import marshal
        try:
            while True:
                res = marshal.load(self.pope.stdout)
                if res.get(b'code') == b'info' and res.get(b'data', ''):
                    data = res.get(b'data')
                    ## Why was this upped to error?
                    #  b"is opened and not being changed" in data or b"must resolve" in data) and
                    if data.startswith(b'Diff chunks') and not data.endswith(
                            b'+ 0 conflicting'):
                        print("*** WARNING: There are conflicts.",
                              file=sys.stderr)
                    elif (b"can't move (already opened for edit)" in data
                          or b"is opened for add and can't be replaced" in data
                          or b"- resolve skipped" in data):
                        res[b'code'] = b'stat'
                        print(
                            f'#o4pass-warn#{data.decode("utf-8",errors="ignore")}'
                        )
                if res.get(b'code') != b'error':
                    return self.transform(res)
                if b'data' in res:
                    if (b'file(s) up-to-date' in res[b'data']
                            or b'no file(s) to reconcile' in res[b'data']
                            or b'no file(s) to resolve' in res[b'data']
                            or b'no file(s) to unshelve' in res[b'data']
                            or b'file(s) not on client' in res[b'data']
                            or b'No shelved files in changelist to delete'
                            in res[b'data']):
                        res[b'code'] = b'stat'
                    elif (b'no file(s) at that changelist number'
                          in res[b'data'] or
                          b'no revision(s) above those at that changelist number'
                          in res[b'data']):
                        # print('*** INFO: Skipping premature sync: ', res)
                        res[b'code'] = b'skip'
                    elif b'clobber writable file' in res[b'data']:
                        res[b'code'] = b'error'
                    # {b'code': b'error', b'data': b'SSL receive failed.\nread: Connection timed out: Connection timed out\n', b'severity': 3, b'generic': 38}
                    # 'data': 'TCP receive exceeded maximum configured duration of 60 seconds.\n', 'severity': 3, 'generic': 38
                    # This seems like it could be 100 different messages; TODO: find out what 'generic' means.
                    elif b'Connection timed out' in res[
                            b'data'] or b'TCP receive exceeded' in res[b'data']:
                        raise P4TimeoutError(res, self.args)
                    if res[b'code'] != b'error':
                        return self.transform(res)
                # Allow operation to complete and report errors after
                self.errors.append(Pyforce.to_str(res))
        except EOFError:
            pass
        if self.stderr.tell():
            self.stderr.seek(0)
            err = self.stderr.read().decode(sys.stdout.encoding)
            if 'timed out' in err:
                raise P4TimeoutError(err)
            self.errors.append({
                'code': 'error',
                'data': f'stderr: {err}',
                'severity': 3,
                'generic': 38
            })
        if self.errors:
            raise P4Error(*self.errors)
        raise StopIteration()

    def __del__(self):
        if hasattr(self, 'pope'):
            try:
                self.pope.kill()
                self.pope.wait()
            except OSError:
                pass

    @staticmethod
    def to_str(r):
        """
        Converts a dictionary of bytes key-values to strings using stdout
        encoding.
        """
        def dec(a):
            if hasattr(a, 'decode'):
                return a.decode(sys.stdout.encoding, errors='ignore')
            return a

        return {dec(k): dec(v) for k, v in r.items()}

    @staticmethod
    def unescape(path):
        """Reverts p4 path escaping."""
        return path.replace('%40',
                            '@').replace('%23',
                                         '#').replace('%2a',
                                                      '*').replace('%25', '%')

    @staticmethod
    def escape(path):
        """Escapes a path like perforce would."""
        return path.replace('%', '%25').replace('#', '%23').replace(
            '*', '%2a').replace('@', '%40')

    @staticmethod
    def checksum(fname, fileSize):
        """
        Probably the only complete resource on how perforce computes a
        checksum. Fundamentally it's a MD5 checksum of the file's
        content. However utf16 files must first be converted to utf8,
        and if the file system file size is 3 bytes larger than the
        stated file size, then if those three bytes are the utf8 BOM,
        they must not be included in the checksum.

        Hence the fileSize argument can be an integer, or in the case
        of utf8 files <int>/utf8, and in the utf16 case <int>/utf16.
        """
        import hashlib
        hash_md5 = hashlib.md5()
        headType = ''
        if type(fileSize) != int:
            if '/' in fileSize:
                fileSize, headType = fileSize.split('/', 1)
            fileSize = int(fileSize)
        try:
            with open(fname, 'rb') as f:
                if headType == 'utf16':
                    # FIXME: Don't overflow and die if there is a giant utf16 file
                    u = f.read().decode('utf16')
                    hash_md5.update(u.encode('utf8'))
                else:
                    if headType == 'utf8':
                        fs = os.fstat(f.fileno())
                        if fs.st_size > fileSize:
                            # Skip utf8 BOM when computing digest, if filesize differs from st_size
                            bom = f.read(3)
                            if bom != b'\xef\xbb\xbf':
                                f.seek(0)
                    for chunk in iter(lambda: f.read(1024 * 1024), b''):
                        hash_md5.update(chunk)
            return hash_md5.hexdigest().upper()
        except FileNotFoundError:
            return None
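Pyforce.checksum above is a chunked MD5 with a special case for a leading UTF-8 BOM. A simplified sketch of that non-utf16 branch (the utf16 branch additionally re-encodes the whole file as utf8 first):

import hashlib

def md5_of_file(fname, skip_utf8_bom=False, chunk_size=1024 * 1024):
    """MD5 a file in fixed-size chunks, optionally ignoring a leading UTF-8 BOM."""
    digest = hashlib.md5()
    with open(fname, 'rb') as fh:
        if skip_utf8_bom:
            bom = fh.read(3)
            if bom != b'\xef\xbb\xbf':
                fh.seek(0)            # not a BOM after all, hash it too
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest().upper()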
Example #42
0
class AttachHTTP(AttachBase):
    """
    A wrapper for HTTP based attachment sources
    """

    # The default descriptive name associated with the service
    service_name = _('Web Based')

    # The default protocol
    protocol = 'http'

    # The default secure protocol
    secure_protocol = 'https'

    # The number of bytes in memory to read from the remote source at a time
    chunk_size = 8192

    # Web based requests are remote/external to our current location
    location = ContentLocation.HOSTED

    def __init__(self, headers=None, **kwargs):
        """
        Initialize HTTP Object

        headers can be a dictionary of key/value pairs that you want to
        additionally include as part of the server headers to post with

        """
        super(AttachHTTP, self).__init__(**kwargs)

        self.schema = 'https' if self.secure else 'http'

        self.fullpath = kwargs.get('fullpath')
        if not isinstance(self.fullpath, six.string_types):
            self.fullpath = '/'

        self.headers = {}
        if headers:
            # Store our extra headers
            self.headers.update(headers)

        # Where our content is written to upon a call to download.
        self._temp_file = None

        # Our Query String Dictionary; we use this to track arguments
        # specified that aren't otherwise part of this class
        self.qsd = {
            k: v
            for k, v in kwargs.get('qsd', {}).items()
            if k not in self.template_args
        }

        return

    def download(self, **kwargs):
        """
        Perform retrieval of the configuration based on the specified request
        """

        if self.location == ContentLocation.INACCESSIBLE:
            # our content is inaccessible
            return False

        # Ensure any existing content set has been invalidated
        self.invalidate()

        # prepare header
        headers = {
            'User-Agent': self.app_id,
        }

        # Apply any/all header over-rides defined
        headers.update(self.headers)

        auth = None
        if self.user:
            auth = (self.user, self.password)

        url = '%s://%s' % (self.schema, self.host)
        if isinstance(self.port, int):
            url += ':%d' % self.port

        url += self.fullpath

        self.logger.debug('HTTP GET URL: %s (cert_verify=%r)' % (
            url,
            self.verify_certificate,
        ))

        # Where our request object will temporarily live.
        r = None

        # Always call throttle before any remote server i/o is made
        self.throttle()

        try:
            # Make our request
            with requests.get(url,
                              headers=headers,
                              auth=auth,
                              params=self.qsd,
                              verify=self.verify_certificate,
                              timeout=self.request_timeout,
                              stream=True) as r:

                # Handle Errors
                r.raise_for_status()

                # Get our file-size (if known)
                try:
                    file_size = int(r.headers.get('Content-Length', '0'))
                except (TypeError, ValueError):
                    # Handle edge case where Content-Length is a bad value
                    file_size = 0

                # Perform a little Q/A on file limitations and restrictions
                if self.max_file_size > 0 and file_size > self.max_file_size:

                    # The content retrieved is too large
                    self.logger.error(
                        'HTTP response exceeds allowable maximum file length '
                        '({}KB): {}'.format(int(self.max_file_size / 1024),
                                            self.url(privacy=True)))

                    # Return False (signifying a failure)
                    return False

                # Detect config format based on mime if the format isn't
                # already enforced
                self.detected_mimetype = r.headers.get('Content-Type')

                d = r.headers.get('Content-Disposition', '')
                result = re.search("filename=['\"]?(?P<name>[^'\"]+)['\"]?", d,
                                   re.I)
                if result:
                    self.detected_name = result.group('name').strip()

                # Create a temporary file to work with
                self._temp_file = NamedTemporaryFile()

                # Get our chunk size
                chunk_size = self.chunk_size

                # Track all bytes written to disk
                bytes_written = 0

                # If we get here, we can now safely write our content to disk
                for chunk in r.iter_content(chunk_size=chunk_size):
                    # filter out keep-alive chunks
                    if chunk:
                        self._temp_file.write(chunk)
                        bytes_written = self._temp_file.tell()

                        # Prevent a case where Content-Length isn't provided
                        # we don't want to fetch beyond our limits
                        if self.max_file_size > 0:
                            if bytes_written > self.max_file_size:
                                # The content retrieved is too large
                                self.logger.error(
                                    'HTTP response exceeds allowable maximum '
                                    'file length ({}KB): {}'.format(
                                        int(self.max_file_size / 1024),
                                        self.url(privacy=True)))

                                # Invalidate any variables previously set
                                self.invalidate()

                                # Return False (signifying a failure)
                                return False

                            elif bytes_written + chunk_size \
                                    > self.max_file_size:
                                # Adjust our next read to accommodate up to our
                                # limit +1. This prevents us from reading too
                                # much into our memory buffer.
                                chunk_size = self.max_file_size - bytes_written + 1

                # Ensure our content is flushed to disk for post-processing
                self._temp_file.flush()

            # Set our minimum requirements for a successful download() call
            self.download_path = self._temp_file.name
            if not self.detected_name:
                self.detected_name = os.path.basename(self.fullpath)

        except requests.RequestException as e:
            self.logger.error('A Connection error occurred retrieving HTTP '
                              'configuration from %s.' % self.host)
            self.logger.debug('Socket Exception: %s' % str(e))

            # Invalidate any variables previously set
            self.invalidate()

            # Return False (signifying a failure)
            return False

        except (IOError, OSError):
            # IOError is present for backwards compatibility with Python
            # versions older than 3.3; >= 3.3 raises OSError now.

            # Could not open and/or write the temporary file
            self.logger.error('Could not write attachment to disk: {}'.format(
                self.url(privacy=True)))

            # Invalidate any variables previously set
            self.invalidate()

            # Return False (signifying a failure)
            return False

        # Return our success
        return True

    def invalidate(self):
        """
        Close our temporary file
        """
        if self._temp_file:
            self._temp_file.close()
            self._temp_file = None

        super(AttachHTTP, self).invalidate()

    def url(self, privacy=False, *args, **kwargs):
        """
        Returns the URL built dynamically based on specified arguments.
        """

        # Our URL parameters
        params = self.url_parameters(privacy=privacy, *args, **kwargs)

        # Prepare our cache value
        if self.cache is not None:
            if isinstance(self.cache, bool) or not self.cache:
                cache = 'yes' if self.cache else 'no'
            else:
                cache = int(self.cache)

            # Set our cache value
            params['cache'] = cache

        if self._mimetype:
            # A format was enforced
            params['mime'] = self._mimetype

        if self._name:
            # A name was enforced
            params['name'] = self._name

        # Append our headers into our parameters
        params.update({'+{}'.format(k): v for k, v in self.headers.items()})

        # Apply any remaining entries to our URL
        params.update(self.qsd)

        # Determine Authentication
        auth = ''
        if self.user and self.password:
            auth = '{user}:{password}@'.format(
                user=self.quote(self.user, safe=''),
                password=self.pprint(self.password,
                                     privacy,
                                     mode=PrivacyMode.Secret,
                                     safe=''),
            )
        elif self.user:
            auth = '{user}@'.format(user=self.quote(self.user, safe=''), )

        default_port = 443 if self.secure else 80

        return '{schema}://{auth}{hostname}{port}{fullpath}?{params}'.format(
            schema=self.secure_protocol if self.secure else self.protocol,
            auth=auth,
            hostname=self.quote(self.host, safe=''),
            port='' if self.port is None or self.port == default_port else
            ':{}'.format(self.port),
            fullpath=self.quote(self.fullpath, safe='/'),
            params=self.urlencode(params),
        )

    @staticmethod
    def parse_url(url):
        """
        Parses the URL and returns enough arguments that can allow
        us to re-instantiate this object.

        """
        results = AttachBase.parse_url(url)

        if not results:
            # We're done early as we couldn't load the results
            return results

        # Add our headers that the user can potentially over-ride if they wish
        # to our returned result set
        results['headers'] = results['qsd-']
        results['headers'].update(results['qsd+'])

        return results
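AttachHTTP.download streams the response into a NamedTemporaryFile and aborts once a size cap is exceeded, whether or not Content-Length was sent. A condensed sketch of that guard logic (download_capped and its defaults are illustrative, not apprise's API):

import requests
from tempfile import NamedTemporaryFile

def download_capped(url, max_file_size=5 * 1024 * 1024, chunk_size=8192,
                    timeout=(4.0, 4.0)):
    """Stream a URL into a NamedTemporaryFile, aborting once the cap is hit.

    Returns the temp file on success, or None when the content is too large
    or the request fails.
    """
    tmp = NamedTemporaryFile()
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            # Trust Content-Length when present, but keep counting anyway.
            if int(r.headers.get('Content-Length', 0) or 0) > max_file_size:
                tmp.close()
                return None
            for chunk in r.iter_content(chunk_size=chunk_size):
                if not chunk:           # skip keep-alive chunks
                    continue
                tmp.write(chunk)
                if tmp.tell() > max_file_size:
                    tmp.close()
                    return None
            tmp.flush()
    except requests.RequestException:
        tmp.close()
        return None
    return tmp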
Example #43
0
    def _write_local_data_files(self, cursor):
        """
        Takes a cursor, and writes results to a local file.

        :return: A dictionary where keys are filenames to be used as object
            names in GCS, and values are file handles to local files that
            contain the data for the GCS objects.
        """
        org_schema = list(
            map(lambda schema_tuple: schema_tuple[0], cursor.description))
        schema = [
            column for column in org_schema
            if column not in self.exclude_columns
        ]

        col_type_dict = self._get_col_type_dict()
        file_no = 0

        tmp_file_handle = NamedTemporaryFile(delete=True)
        if self.export_format == 'csv':
            file_mime_type = 'text/csv'
        elif self.export_format == 'parquet':
            file_mime_type = 'application/octet-stream'
        else:
            file_mime_type = 'application/json'
        file_to_upload = {
            'file_name': self.filename.format(file_no),
            'file_handle': tmp_file_handle,
            'file_mime_type': file_mime_type,
            'file_row_count': 0,
        }

        if self.export_format == 'csv':
            csv_writer = self._configure_csv_file(tmp_file_handle, schema)
        if self.export_format == 'parquet':
            parquet_schema = self._convert_parquet_schema(cursor)
            parquet_writer = self._configure_parquet_file(
                tmp_file_handle, parquet_schema)

        for row in cursor:
            file_to_upload['file_row_count'] += 1
            if self.export_format == 'csv':
                row = self.convert_types(schema, col_type_dict, row)
                if self.null_marker is not None:
                    row = [
                        value if value is not None else self.null_marker
                        for value in row
                    ]
                csv_writer.writerow(row)
            elif self.export_format == 'parquet':
                row = self.convert_types(schema, col_type_dict, row)
                if self.null_marker is not None:
                    row = [
                        value if value is not None else self.null_marker
                        for value in row
                    ]
                row_pydic = {col: [value] for col, value in zip(schema, row)}
                tbl = pa.Table.from_pydict(row_pydic, parquet_schema)
                parquet_writer.write_table(tbl)
            else:
                row = self.convert_types(schema,
                                         col_type_dict,
                                         row,
                                         stringify_dict=False)
                row_dict = dict(zip(schema, row))

                tmp_file_handle.write(
                    json.dumps(row_dict, sort_keys=True,
                               ensure_ascii=False).encode("utf-8"))

                # Append newline to make dumps BigQuery compatible.
                tmp_file_handle.write(b'\n')

            # Stop if the file exceeds the file size limit.
            if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
                file_no += 1

                if self.export_format == 'parquet':
                    parquet_writer.close()
                yield file_to_upload
                tmp_file_handle = NamedTemporaryFile(delete=True)
                file_to_upload = {
                    'file_name': self.filename.format(file_no),
                    'file_handle': tmp_file_handle,
                    'file_mime_type': file_mime_type,
                    'file_row_count': 0,
                }
                if self.export_format == 'csv':
                    csv_writer = self._configure_csv_file(
                        tmp_file_handle, schema)
                if self.export_format == 'parquet':
                    parquet_writer = self._configure_parquet_file(
                        tmp_file_handle, parquet_schema)
        if self.export_format == 'parquet':
            parquet_writer.close()
        # Last file may have 0 rows, don't yield if empty
        if file_to_upload['file_row_count'] > 0:
            yield file_to_upload
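Because this variant yields one file descriptor at a time, a caller can upload and release each chunk before the next one is produced. A hedged sketch of such a consumer; upload_to_gcs is a hypothetical callable, not part of the operator shown above:

def upload_files(file_iter, upload_to_gcs):
    """Consume the dicts yielded above: rewind each handle, hand it to a
    caller-supplied upload function, then close it so the temp file is freed."""
    total_rows = 0
    for item in file_iter:
        handle = item['file_handle']
        handle.flush()
        handle.seek(0)
        upload_to_gcs(item['file_name'], handle, item['file_mime_type'])
        total_rows += item['file_row_count']
        handle.close()
    return total_rows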
Example #44
0
def test_binary_guesser():
    """Test bio_utils' binary_guesser with binary and text data"""

    # Store data
    binary_data = b'\x8e\xd2\x837U\xbc\\!H\xc8\xb1O\xac\x9e\xbf\xd4b\x82\xc9' \
                  b'\xd7\xaa\xb9\x16Uo5m\r\x00\x1e\xdd\x978\x00Rj\xe2Ng\xc3' \
                  b'=\xe6N}\x92\xf0(+\xa3\x99\\w\xe0\xa6\xb4\xa4\xc2\x90\x81' \
                  b'\xc4@\x10\x0f_\xdf\xdeo\r\xdc\xcd<\x7fq\x87\xb4\n\xcd' \
                  b'\xd2\r=\xfb\x84\xfb\xa5\xc0\x9e\xb4wl6j\xa9\xae\xe5\xc1' \
                  b'\xfb^\\L\xc8\x0b\xd1fU\xd1\xdd]\x06\x19\xf7\xc6\x90?x' \
                  b'\x06\x8ab\x0b\x14\xa4\x00z\x83\xe8\x90\x16@U\xba~\xbb' \
                  b'\xcf\x90\xb2\xdb>^A\xd1\xd45\xd7\xbc\x99\xf26\xf4\xa0' \
                  b'\x8f-\x04)\xf9[\x7f\xca\x81\xcd\x04\xefd\x9ci\xe8lH' \
                  b'\xce\xb8\xe6R\xe4#\xb5\x16\x97a\xd2\xda2\x1d\x9d\xb1#1 ' \
                  b'\xe1u\x04g2\xe4\xf0B\xa6\xcd\x00q\x9d=N\x1f\xf1%\xa6' \
                  b'\x89\xc2\xb4j\xeb\x90\x07>kJ\xefi\xd2tp\xb0\xf1\xb7' \
                  b'\xbb\xc8\xa8cZ\x0c\x88\xe2\x08\x0b\x05\xddS\x86\xa4s' \
                  b'\x1ck\x90\xa3\t(\x03n\xe8S\x8a\x03\xe3*\xb4\x02\x06%' \
                  b'\xfe2.?&\x13\x94\xea7\xd1\xb9\xef\xe1\x94Y\xbd58\xf4Y' \
                  b'\x13\xe9r\x90\x84\x0e{\xe2\x98\x12\xff\xf4f\x87J\xfc:' \
                  b'\xd7\xd9\xc6\xbf\xd3IU\xf5\\\xa1\xb0\xad\x04#\x9c\x0c' \
                  b'\x1d\x90\xbb\x93\xee\xbb\r\xa7\x96\t\x8b\xc1\x91\xecl' \
                  b'\xe1\x0f~3@\xa7\x98\re\x9b\x8fy\xb8U\x18\x04z\xe8\rT?' \
                  b'\xed\xb0\n\xf7*\xc8\xce\xb5N8\xaeh\x06\x84\'\xdd6SI' \
                  b'\xd6\xf9\xbdz\xd3\xab\xe3\xd9\xb3*BBd\xc0\x9d\xd6\x8a' \
                  b'\xb1\xe8\xc4\xb9\xacw|>\x80y\x86\xfcM!\x1b\xc9\xff\x93' \
                  b'\x8d\xb5\x89IL\x93J\x88\x0b\xe5\'\xbd\x13\xa9\xd5\xa0' \
                  b'\xe9Rs\xce,\x8e%\xdbQ\x85##I\x93\x04\xec\x98V\x8d\x9b' \
                  b'\xd9B9?z\'>Aq\x10`&\x0e\xa1\xb2\x94\x0c}"QI\x82\xf5.O' \
                  b'\x9a:uu|\xdd\x86^\xfd\x0bu\xbf05\xea\\e\xc7\\\xbe\xd9' \
                  b'\x98\x0fFo9\xb1\n`\xe9\x8ccg\n\x13\xcb\x1b!\xb2\xcdt|' \
                  b'\xc7!\xfawn3\xf0p\xb1n\xb6^\xe1;S\xa0\xf3y.\x8e\x83{' \
                  b'\x9f\x03\xa1\xfe\x8b\xae\xd4\xfa\xafh\xefP\x8c\xa0\xc1' \
                  b'\x8dWW\x85\xa0\xfeT\xa8\xa3\xe1\x85\x11G\x0f5\x83\xec' \
                  b'\xebvJ\x1a(\xbdk\x8c\xbbf\x81\x1d\xc0\x91[\x1c\x9d\xa4' \
                  b'\x0c\x81\xfe\x94-\xd9\xa0\xd3\x0c\xe0~\r\x8eZ\xc91>\xac' \
                  b'\x935\x94H\xfeN\x02\t\xe5\xb15X3\xcb3n\xec\x82\xbcl\x05' \
                  b'\xa7\x07X\xc6\x1a`\x1b\xd3\x85\x0c<c\x81K$\xb9#\x12h' \
                  b'\xa9gN\xce\x8f:\x0e\xe1r\xf2K\xc1\x05\xa5J6\x12\xf8\xd7' \
                  b'\xce\xcb@\xea\xb3\x0c]\x89\xe3\x9b)\xcd\x11\x06\x9bH4\n' \
                  b'\xad\xbd\xdb\x80U\r\x9e\xf6h$;Gov\xb3\x03\x88a\x81.MA' \
                  b'\x99\xc2\xc2Q\x1c=3c#)\xfb\xc1\x10f<xI\xef\xb2\xdcP' \
                  b'\xd9P\x1d\xc68\xec#-\xbd\xf2\x8c\x16a\xaa\x1a\xb6qb\x15' \
                  b'\xa8\xcct\xb8e\xc9\xbb\xd6S\x01 U\xcfw\xbd\xc0\xab\xb3l' \
                  b'\x1d\xd2\xa6k\x04\x06G_\x0e\x9bjam\xb4\xc4-\xcf\xad\x07c' \
                  b'\xf9"N\x8c\xe3r.\x0cq\xe2\x8c\x99\xd5\xa9\xfc\xbevRW7' \
                  b'\x17y\xfd\xbf\x9bq\t\x92\x1d\xc9\x19E\xd5\xedJ\xea9\xa4' \
                  b'\xd26~\xcc\x12\x9b\x12\xc4\x96(\xbe\xd7\x05-\xc9\x9f\x02' \
                  b'\xe2\x08f\xaf\'J\x0c\xb1\xcd\xa6\x80k)s\xa8\xbe\x15\x9d' \
                  b'\r}P2\xa1u\r~T\xedq\xa1X3o\x0b\xcb\x9dN\x8dAME\xe9\xcb\n' \
                  b'\xc6 ,\n\xa3\xba\x9a\x15\xc5-\xbaW\x89y?\xe3\x16 T!\xf0' \
                  b'\xf5\xfd\xa3Ks3\xb7\xe9F#\xdd\xebQ\xa9+#\xf9WG\x05\x93' \
                  b'\x93\x9a\x127\xf7d\xf2\x1cx\x9a2\x0fB\xber*\xc4\x90\xf8' \
                  b'\x07\xd7#\xf4\xff\xc0\xdcF\xd7<d\xb0\xdb\xcf\xa1\x1e' \
                  b'\xd2\x98\xde\xd1=u\xa6\xc4\x81\xf0\x04#x\xb6\xde\x0e\xbe' \
                  b'\xc6\x1b:\x10\x8f\xdf\xa3\x99E\xa2\xc2W\xde\xa7\x03\xe6x' \
                  b'\xc3\x07\x9d\xf1\x01$\x1d\xa1L\xad\xe8bnI\x14\xe7\xc1,'

    text_data = 'BGwrYz3oUOoys8NJQN0Ju43r28l/bdXne8YbOZWiPMMoZFyxp9Qmc4NK6k' \
                'Bs/DA2ZougW3RVZGAs\n3RRPLU78oRpTH3jzSViqj0jEtpMIwpOofhDjyP' \
                '8bM7/bHWIa9XruomgdnOxkttqMc/Mxj6ZcODlv\nGADtY86z+/VdfO9lDj' \
                'nwYmkkvjPN3qxpy6LIx9ZPMKpwCzTheidJR95u6gG+1ofA5HYaLIReujUn' \
                '\ntvtZKu49pmiEuz5tT0VWRPHR/7q2Eg5u7SZAhlWtOW+G/P7QkLFButy8' \
                'sArJwCBtEl6DH7B+L570\nZxfBaF1yaFU7VmZNL3e6MIq2Lgkk6TU3Ezvy' \
                'LMB1ZLt8Zpst4tL814fMmJ6QazUaafG73YQkmoVg\nGdbemZBu3CLxJ3iX' \
                'i9NPZxDionF9yNAt7gdiGqrVC3lRJIgSF1wn5/jqsdv8OhBI98DWOOYGmv' \
                'EJ\nM+DztfOx4KQpA4TSunCRK/2H6POolGN1gOXbteUZY4cA2FreVW15QG' \
                '/an30epRiKH/cgeNdEuIIe\niFsWt62tFTxXaQZZbc/p/hwUJ7iSMeYpq7' \
                'WgYmJQmkdHggKFFZniuI5VyE1YHqVu1bZEhLaI3XSJ\npGF9dvGRCamzGO' \
                'xLnz7TsjbVM45maSPXGJVw5OgZrZhqPdZNKgplblL8xvg//lRF582cYQFy' \
                'yM8X\nOGqN83/QKo02FwEdqGg6DD5zzbLys4K/HjYguARUHLMBziFCvq2x' \
                '9z31pSJUUCaBVit0Z4S4cCiK\narptw/91PnBJCdchBk0T62Kt4E41ClWV' \
                'OUWZcLKWVhW689HLrvO4YCBi+qZDtTJFK1cmahAh9xZj\n1KmfvZzM6QFB' \
                'RTtH2qzvEsgiA6lu9u1HS8ohHFxEYDJ32XKoNSQtarfOpjw/sA3kUaBi5a' \
                '1Josah\nXDyGoXSXdtVq2wdZLLf7uuwbTUZae6j+bl5R7dYTkKzhsaVmpU' \
                'zkrCHjl7XB+9YfpNwiCYPIfZSQ\nNluAEf2OeGozMipZ47fh9PMvWHri3g' \
                '8pA/7B9Nn8K3mSmEDLBBZgkcKynR6rtSgzj2hIX0qS0/iX\nihk5ZjvZiu' \
                'tqPiix6j+SSl59jk2WERh1IVHHWtBJUknbTlV3reTL+aWZHfkUioA0RSRi' \
                'cwBTY6ou\nnypnq8l4mPTWUCZReDz7N5OEGWquroD8Fv4+IB5EviVI6Xrj' \
                'Yil8m0rIjtbmwgFK0kSvkTEUI0DD\nCH3TY/+tXgLWA6scXG46T9+deuM0' \
                'F7H/+4iRfnLV1LMV8J+roIFcg3VPX1yBW4wryXNdERVNhbTk\nI/9c17pC' \
                '8fWqhv8kLBvcZcbzn6XDkKWXcQ6VOwiopYw/b6HaPDR7zSeBhNoPPJEw5q' \
                'q6ZSs2eA==\n'

    binary_handle = NamedTemporaryFile(mode='wb+')
    binary_handle.write(binary_data)
    binary_handle.seek(0)

    binary_guesser(binary_handle)
    assert binary_handle.tell() == 0

    text_handle = NamedTemporaryFile(mode='wt+')
    text_handle.write(text_data)
    text_handle.seek(256)

    try:
        binary_guesser(text_handle, num_bytes=128)
    except FormatError as error:
        assert error.message == '{0} is probably not a binary ' \
                                'file'.format(text_handle.name)
    assert text_handle.tell() == 256
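The test relies on binary_guesser sampling bytes and then restoring the handle's position. A self-contained illustration of one common way to make that guess while preserving the position (an assumption about the technique, not bio_utils' actual heuristic):

def looks_binary(handle, num_bytes=512):
    """Sample num_bytes from the current position, then seek back so the
    caller's position is preserved, as the test above expects."""
    pos = handle.tell()
    sample = handle.read(num_bytes)
    handle.seek(pos)
    if isinstance(sample, str):       # text-mode handle: clearly not binary
        return False
    if b'\x00' in sample:             # NUL bytes almost never occur in text
        return True
    try:
        sample.decode('utf-8')
    except UnicodeDecodeError:
        return True
    return False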
Example #45
0
def commit_file(srcfile, user_id, extension):
    if USE_COMPRESSION:
        extension += '.gz'

    # We need to copy the file contents to the original location
    compfile = NamedTemporaryFile(prefix='twitter-', dir=TEMPORARY_DIRECTORY, delete=False)

    with profiled("Compressing output in %s"):
        with compressor(compfile) as gzfile:
            srcfile.seek(0)
            shutil.copyfileobj(srcfile, gzfile)
            log.msg("Output file size is %d bytes (%d bytes compressed)" % (gzfile.tell(), compfile.tell()))

        srcfile.close() # Delete the old plain file
        compfile.close()

    commit_file_compressed(compfile, user_id, extension)
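commit_file's compression step is shutil.copyfileobj from the source file into a gzip stream wrapped around a NamedTemporaryFile. A minimal runnable sketch of that step, assuming compressor above is a gzip.GzipFile-style wrapper:

import gzip
import shutil
from tempfile import NamedTemporaryFile

src = NamedTemporaryFile()
src.write(b'some captured output\n' * 1000)
src.seek(0)

compfile = NamedTemporaryFile(prefix='twitter-', suffix='.gz', delete=False)
with gzip.GzipFile(fileobj=compfile, mode='wb') as gzfile:
    shutil.copyfileobj(src, gzfile)
print('%d bytes compressed down to %d' % (src.tell(), compfile.tell()))
compfile.close()
src.close()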
Example #46
0
class CallbackFileWrapper(object):
    """
    Small wrapper around a fp object which will tee everything read into a
    buffer, and when that file is closed it will execute a callback with the
    contents of that buffer.

    All attributes are proxied to the underlying file object.

    This class uses members with a double underscore (__) leading prefix so as
    not to accidentally shadow an attribute.

    The data is stored in a temporary file until it is all available.  As long
    as the temporary files directory is disk-based (sometimes it's a
    memory-backed-``tmpfs`` on Linux), data will be unloaded to disk if memory
    pressure is high.  For small files the disk usually won't be used at all,
    it'll all be in the filesystem memory cache, so there should be no
    performance impact.
    """
    def __init__(self, fp, callback):
        self.__buf = NamedTemporaryFile("rb+", delete=True)
        self.__fp = fp
        self.__callback = callback

    def __getattr__(self, name):
        # The vagaries of garbage collection mean that self.__fp is
        # not always set.  Using __getattribute__ with the mangled
        # name [0] lets us look up the attribute value and raise an
        # AttributeError when it doesn't exist. This stops things from
        # infinitely recursing calls to getattr in the case where
        # self.__fp hasn't been set.
        #
        # [0] https://docs.python.org/2/reference/expressions.html#atom-identifiers
        fp = self.__getattribute__("_CallbackFileWrapper__fp")
        return getattr(fp, name)

    def __is_fp_closed(self):
        try:
            return self.__fp.fp is None

        except AttributeError:
            pass

        try:
            return self.__fp.closed

        except AttributeError:
            pass

        # We just don't cache it then.
        # TODO: Add some logging here...
        return False

    def _close(self):
        if self.__callback:
            if self.__buf.tell() == 0:
                # Empty file:
                result = b""
            else:
                # Return the data without actually loading it into memory,
                # relying on Python's buffer API and mmap(). mmap() just gives
                # a view directly into the filesystem's memory cache, so it
                # doesn't result in duplicate memory use.
                self.__buf.seek(0, 0)
                result = memoryview(
                    mmap.mmap(self.__buf.fileno(), 0, access=mmap.ACCESS_READ))
            self.__callback(result)

        # We assign this to None here, because otherwise we can get into
        # really tricky problems where the CPython interpreter dead locks
        # because the callback is holding a reference to something which
        # has a __del__ method. Setting this to None breaks the cycle
        # and allows the garbage collector to do its thing normally.
        self.__callback = None

        # Closing the temporary file releases memory and frees disk space.
        # Important when caching big files.
        self.__buf.close()

    def read(self, amt=None):
        data = self.__fp.read(amt)
        if data:
            # We may be dealing with b'', a sign that things are over:
            # it's passed e.g. after we've already closed self.__buf.
            self.__buf.write(data)
        if self.__is_fp_closed():
            self._close()

        return data

    def _safe_read(self, amt):
        data = self.__fp._safe_read(amt)
        if amt == 2 and data == b"\r\n":
            # urllib executes this read to toss the CRLF at the end
            # of the chunk.
            return data

        self.__buf.write(data)
        if self.__is_fp_closed():
            self._close()

        return data
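The interesting part of _close() is handing the buffered body back as a memoryview over an mmap of the temp file instead of reading it into memory. A small self-contained demonstration of that zero-copy read-back:

import mmap
from tempfile import NamedTemporaryFile

buf = NamedTemporaryFile('rb+', delete=True)
buf.write(b'response body ' * 1024)
buf.flush()                        # make sure the bytes are really in the file

buf.seek(0, 0)
mm = mmap.mmap(buf.fileno(), 0, access=mmap.ACCESS_READ)
result = memoryview(mm)            # a view into the page cache, no copy
print(len(result), bytes(result[:13]))

result.release()                   # release the exported buffer ...
mm.close()                         # ... so the map and the temp file can close
buf.close()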
Example #47
0
class FileStorage(Storage):
    """
    Instance attributes:
      fp : file
      index : { oid:string : offset:int }
        Gives the offset of the current version of each oid.
      pending_records : { oid:str : record:str }
        Object records are accumulated here during a commit.
      pack_extra : [oid:str] | None
        oids of objects that have been committed after the pack began.  It is
        None if a pack is not in progress.
    """

    _PACK_INCREMENT = 20 # number of records to pack before yielding

    def __init__(self, filename=None, readonly=False, repair=False):
        """(filename:str=None, readonly:bool=False, repair:bool=False)
        If filename is empty (or None), a temporary file will be used.
        """
        self.oid = 0
        self.filename = filename
        if readonly:
            if not filename:
                raise ValueError(
                    "A filename is required for a readonly storage.")
            if repair:
                raise ValueError("A readonly storage can't be repaired.")
            self.fp = open(self.filename, 'rb')
        else:
            if not filename:
                self.fp = NamedTemporaryFile(suffix=".durus", mode="w+b")
            elif os.path.exists(self.filename):
                self.fp = open(self.filename, 'a+b')
            else:
                self.fp = open(self.filename, 'w+b')
            try:
                lock_file(self.fp)
            except IOError:
                self.fp.close()
                raise RuntimeError(
                    "\n  %s is locked."
                    "\n  There is probably a Durus storage server (or a client)"
                    "\n  using it.\n" % self.get_filename())
        self.pending_records = {}
        self.pack_extra = None
        self.repair = repair
        self._set_concrete_class_for_magic()
        self.index = {}
        self._build_index()
        max_oid = 0
        for oid in self.index:
            max_oid = max(max_oid, u64(oid))
        self.oid = max_oid

    def _set_concrete_class_for_magic(self):
        """
        FileStorage is an abstract class.
        The constructor calls this to set self.__class__ to a subclass
        that matches the format of the underlying file.
        If the underlying file is empty, this writes the magic
        string into the file.
        """
        if self.__class__ is FileStorage:
            for format in (FileStorage1, FileStorage2):
                self.fp.seek(0)
                self.__class__ = format
                if format.MAGIC == self.fp.read(len(format.MAGIC)):
                    return
        # Write header for new FileStorage2 file.
        self.fp.seek(0, 2)
        if self.fp.tell() != 0:
            raise IOError, "%r has no FileStorage magic" % self.fp
        self._write_header(self.fp)
        self._write_index(self.fp, {})

    def _write_header(self, fp):
        fp.seek(0, 2)
        assert fp.tell() == 0
        fp.write(self.MAGIC)

    def _write_index(self, fp, index):
        pass

    def get_size(self):
        return len(self.index)

    def new_oid(self):
        self.oid += 1
        return p64(self.oid)

    def load(self, oid):
        if self.fp is None:
            raise IOError, 'storage is closed'
        offset = self.index[oid]
        self.fp.seek(offset)
        return self._read_block()

    def begin(self):
        pass

    def store(self, oid, record):
        """Add a record during a commit."""
        self.pending_records[oid] = record

    def _generate_pending_records(self):
        for oid, record in self.pending_records.iteritems():
            yield oid, record

    def end(self, handle_invalidations=None):
        """Complete a commit.
        """
        if self.fp is None:
            raise IOError, 'storage is closed'
        index = {}
        for z in self._write_transaction(
            self.fp, self._generate_pending_records(), index):
            pass
        self.fp.flush()
        fsync(self.fp)
        self.index.update(index)
        if self.pack_extra is not None:
            self.pack_extra.extend(index)
        self.pending_records.clear()

    def sync(self):
        """
        A FileStorage is the storage of one StorageServer or one
        Connection, so there can never be any invalidations to transfer.
        """
        return []

    def get_filename(self):
        """() -> str
        The name of the file.
        If a tempfile is being used, the name will change when it is packed.
        """
        return self.filename or self.fp.name

    def _write_transaction(self, fp, records, index):
        fp.seek(0, 2)
        for i, (oid, record) in enumerate(records):
            full_record = self._disk_format(record)
            index[oid] = fp.tell()
            fp.write(p32(len(full_record)))
            fp.write(full_record)
            if i % self._PACK_INCREMENT == 0:
                yield None
        fp.write(p32(0)) # terminator

    def _disk_format(self, record):
        return record

    def _packer(self):
        if self.filename:
            prepack_name = self.filename + '.prepack'
            pack_name = self.filename + '.pack'
            packed = open(pack_name, 'w+b')
        else:
            packed = NamedTemporaryFile(suffix=".durus",
                                        mode="w+b")
        lock_file(packed)
        self._write_header(packed)
        def gen_reachable_records():
            todo = [ROOT_OID]
            seen = Set()
            while todo:
                oid = todo.pop()
                if oid in seen:
                    continue
                seen.add(oid)
                record = self.load(oid)
                record_oid, data, refdata = unpack_record(record)
                assert oid == record_oid
                todo.extend(split_oids(refdata))
                yield oid, record
            while self.pack_extra:
                oid = self.pack_extra.pop()
                yield oid, self.load(oid)
        index = {}
        for z in self._write_transaction(
            packed, gen_reachable_records(), index):
            yield None
        self._write_index(packed, index)
        packed.flush()
        fsync(packed)
        if self.filename:
            if not RENAME_OPEN_FILE:
                unlock_file(packed)
                packed.close()
            unlock_file(self.fp)
            self.fp.close()
            if os.path.exists(prepack_name): # for Win32
                os.unlink(prepack_name)
            os.rename(self.filename, prepack_name)
            os.rename(pack_name, self.filename)
            if RENAME_OPEN_FILE:
                self.fp = packed
            else:
                self.fp = open(self.filename, 'r+b')
                lock_file(self.fp)
        else: # tempfile
            unlock_file(self.fp)
            self.fp.close()
            self.fp = packed
        self.index = index
        self.pack_extra = None

    def get_packer(self):
        """Return an incremental packer (a generator).  Each time next() is
        called, up to _PACK_INCREMENT records will be packed.  Note that the
        generator must be exhausted before calling get_packer() again.
        """
        if self.fp is None:
            raise IOError, 'storage is closed'
        if self.fp.mode == 'rb':
            raise IOError, "read-only storage"
        assert not self.pending_records
        assert self.pack_extra is None
        self.pack_extra = []
        return self._packer()

    def pack(self):
        for z in self.get_packer():
            pass

    def gen_oid_record(self):
        """() -> sequence([(oid:str, record:str)])
        Generate oid, record pairs, for all oids in the database.
        Note that this may include oids that are not reachable from
        the root object.
        """
        for oid in self.index:
            yield oid, self.load(oid)

    def close(self):
        if self.fp is not None:
            unlock_file(self.fp)
            self.fp.close()
            self.fp = None

    def _read_block(self):
        size_str = self.fp.read(4)
        if len(size_str) == 0:
            raise IOError, "eof"
        size = u32(size_str)
        if size == 0:
            return ''
        result = self.fp.read(size)
        if len(result) != size:
            raise IOError, "short read"
        return result