class TombOutputThread(QtCore.QThread):
    """Background thread that tails a temporary buffer file, emitting a Qt
    signal for every line received and another for parsed error lines."""

    line_received = QtCore.pyqtSignal(QtCore.QString)
    error_received = QtCore.pyqtSignal(QtCore.QString)
    progressed = QtCore.pyqtSignal(int)  # value in percent

    def __init__(self):
        QtCore.QThread.__init__(self)
        # subprocess output is redirected into this file; we poll it below
        self.buffer = NamedTemporaryFile()

    def run(self):
        while True:
            where = self.buffer.tell()
            line = self.buffer.readline()
            if not line:
                # nothing new yet: wait and rewind so a partial line is
                # re-read in full on the next pass
                time.sleep(1)
                self.buffer.seek(where)
            else:
                # ansi color escapes mess this up, but it's ok anyway
                self.line_received.emit(line)
                self.parse_line(line)

    def parse_line(self, line):
        # This could be simplified, and s/search/match, if --no-color supported
        # see #59
        # TODO: this should be moved to tomblib.parse
        parsed = parse_line(line)
        if parsed and parsed['type'] == 'error':
            # BUG FIX: parse_line() returns a dict (see parsed['type'] just
            # above), so the message must be read by key — `parsed.content`
            # raised AttributeError whenever an error line was parsed.
            self.error_received.emit(parsed['content'])
def _write_local_data_files(self, cursor):
    """
    Takes a cursor, and writes results to a local file.

    :return: A dictionary where keys are filenames to be used as object
        names in GCS, and values are file handles to local files that
        contain the data for the GCS objects.
    """
    # Column names from the cursor description, with spaces normalized.
    schema = [column[0].replace(' ', '_') for column in cursor.description]
    file_no = 0
    current_file = NamedTemporaryFile(delete=True)
    file_handles = {self.filename.format(file_no): current_file}

    for row in cursor:
        # Convert if needed
        converted = map(self.convert_types, row)
        record = dict(zip(schema, converted))
        current_file.write(json.dumps(record, sort_keys=True).encode('utf-8'))
        # Append newline to make dumps BQ compatible
        current_file.write(b'\n')

        # Stop if the file exceeds the file size limit
        if current_file.tell() >= self.approx_max_file_size_bytes:
            file_no += 1
            current_file = NamedTemporaryFile(delete=True)
            file_handles[self.filename.format(file_no)] = current_file

    return file_handles
def _write_local_data_files(self, cursor):
    """
    Takes a cursor, and writes results to a local file.

    :return: A dictionary where keys are filenames to be used as object
        names in GCS, and values are file handles to local files that
        contain the data for the GCS objects.
    """
    # Column names straight from the DB-API cursor description.
    schema = list(map(lambda schema_tuple: schema_tuple[0], cursor.description))
    col_type_dict = self._get_col_type_dict()
    file_no = 0
    tmp_file_handle = NamedTemporaryFile(delete=True)
    if self.export_format == 'csv':
        file_mime_type = 'text/csv'
    else:
        file_mime_type = 'application/json'
    # Each entry describes one chunk file destined for GCS upload.
    files_to_upload = [{
        'file_name': self.filename.format(file_no),
        'file_handle': tmp_file_handle,
        'file_mime_type': file_mime_type
    }]
    if self.export_format == 'csv':
        csv_writer = self._configure_csv_file(tmp_file_handle, schema)
    for row in cursor:
        # Convert datetime objects to utc seconds, and decimals to floats.
        # Convert binary type object to string encoded with base64.
        row = self._convert_types(schema, col_type_dict, row)
        if self.export_format == 'csv':
            csv_writer.writerow(row)
        else:
            row_dict = dict(zip(schema, row))
            # TODO validate that row isn't > 2MB. BQ enforces a hard row size of 2MB.
            s = json.dumps(row_dict, sort_keys=True).encode('utf-8')
            tmp_file_handle.write(s)
            # Append newline to make dumps BigQuery compatible.
            tmp_file_handle.write(b'\n')
        # Stop if the file exceeds the file size limit: roll over to a fresh
        # temp file and register it for upload.
        if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
            file_no += 1
            tmp_file_handle = NamedTemporaryFile(delete=True)
            files_to_upload.append({
                'file_name': self.filename.format(file_no),
                'file_handle': tmp_file_handle,
                'file_mime_type': file_mime_type
            })
            if self.export_format == 'csv':
                # A new file needs its own csv writer bound to it.
                csv_writer = self._configure_csv_file(
                    tmp_file_handle, schema)
    return files_to_upload
def xls_export(request, username, id_string):
    """Serve a form's submissions as an Excel (.xls or .xlsx) download."""
    owner = get_object_or_404(User, username=username)
    xform = get_object_or_404(XForm, id_string=id_string, user=owner)
    if not has_permission(xform, owner, request):
        return HttpResponseForbidden(_(u"Not shared."))
    query = request.GET.get("query")
    force_xlsx = request.GET.get("xlsx") == "true"
    xls_df_builder = XLSDataFrameBuilder(username, id_string, query)
    excel_defs = {
        "xls": {"suffix": ".xls", "mime_type": "vnd.ms-excel"},
        "xlsx": {"suffix": ".xlsx", "mime_type": "vnd.openxmlformats"},  # TODO: check xlsx mime type
    }
    # xlsx is used when explicitly requested or when the data won't fit xls.
    ext = "xlsx" if force_xlsx or xls_df_builder.exceeds_xls_limits else "xls"
    try:
        temp_file = NamedTemporaryFile(suffix=excel_defs[ext]["suffix"])
        xls_df_builder.export_to(temp_file.name)
        if request.GET.get("raw"):
            id_string = None
        response = response_with_mimetype_and_name(
            excel_defs[ext]["mime_type"], id_string, extension=ext)
        response.write(temp_file.read())
        temp_file.seek(0, os.SEEK_END)
        response["Content-Length"] = temp_file.tell()
        temp_file.close()
        return response
    except NoRecordsFoundError:
        return HttpResponse(_("No records found to export"))
def make_image_file(dimensions=(320, 240), extension=".jpeg", force_size=None):
    """
    Yields a named temporary file created with the specified image type and
    options.

    Note the default dimensions are unequal (not a square) ensuring that
    center-square cropping logic will be exercised during tests.

    The temporary file will be closed and deleted automatically upon exiting
    the `with` block.
    """
    image = Image.new('RGB', dimensions, "green")
    image_file = NamedTemporaryFile(suffix=extension)
    try:
        image.save(image_file)
        if force_size is not None:
            # pad the file with zero bytes until it reaches force_size,
            # writing in hunks of 256 bytes
            image_file.seek(0, os.SEEK_END)
            bytes_to_pad = force_size - image_file.tell()
            hunk = bytearray(256)
            single = bytearray(1)
            num_hunks, remainder = divmod(bytes_to_pad, 256)
            for _ in xrange(num_hunks):
                image_file.write(hunk)
            for _ in xrange(remainder):
                image_file.write(single)
            image_file.flush()
        image_file.seek(0)
        yield image_file
    finally:
        image_file.close()
def _write_local_data_files(self, cursor):
    """
    Takes a cursor, and writes results to a local file.

    :return: A dictionary where keys are filenames to be used as object
        names in GCS, and values are file handles to local files that
        contain the data for the GCS objects.
    """
    file_no = 0
    current_file = NamedTemporaryFile(delete=True)
    file_handles = {self.filename.format(file_no): current_file}

    for row in cursor:
        self.log.info(row)
        if PY3:
            row = row.encode('utf-8')
        current_file.write(row)
        # Append newline to make dumps BigQuery compatible.
        current_file.write(b'\n')

        # Stop if the file exceeds the file size limit.
        if current_file.tell() >= self.approx_max_file_size_bytes:
            file_no += 1
            current_file = NamedTemporaryFile(delete=True)
            file_handles[self.filename.format(file_no)] = current_file

    return file_handles
def _write_local_data_files(self, cursor):
    """
    Takes a cursor, and writes results to a local file.

    :return: A dictionary where keys are filenames to be used as object
        names in GCS, and values are file handles to local files that
        contain the data for the GCS objects.
    """
    schema = [description_item[0] for description_item in cursor.description]
    file_no = 0
    current_file = NamedTemporaryFile(delete=True)
    file_handles = {self.filename.format(file_no): current_file}

    for row in cursor:
        # Convert datetime objects to utc seconds, and decimals to floats
        converted_row = map(self.convert_types, row)
        row_dict = dict(zip(schema, converted_row))

        # TODO validate that row isn't > 2MB. BQ enforces a hard row size of 2MB.
        serialized = json.dumps(row_dict)
        if PY3:
            serialized = serialized.encode('utf-8')
        current_file.write(serialized)
        # Append newline to make dumps BigQuery compatible.
        current_file.write(b'\n')

        # Stop if the file exceeds the file size limit.
        if current_file.tell() >= self.approx_max_file_size_bytes:
            file_no += 1
            current_file = NamedTemporaryFile(delete=True)
            file_handles[self.filename.format(file_no)] = current_file

    return file_handles
def _write_local_data_files(self, cursor):
    """
    Takes a cursor, and writes results to a local file.

    :return: A dictionary where keys are filenames to be used as object
        names in GCS, and values are file handles to local files that
        contain the data for the GCS objects.
    """
    # Column names with spaces replaced so they are valid GCS/BQ field names.
    schema = [col[0].replace(' ', '_') for col in cursor.description]
    file_no = 0
    handle = NamedTemporaryFile(delete=True)
    handles = {self.filename.format(file_no): handle}
    for row in cursor:
        # Convert if needed
        record = dict(zip(schema, map(self.convert_types, row)))
        handle.write(json.dumps(record, sort_keys=True).encode('utf-8'))
        # Append newline to make dumps BQ compatible
        handle.write(b'\n')
        # Stop if the file exceeds the file size limit
        if handle.tell() >= self.approx_max_file_size_bytes:
            file_no += 1
            handle = NamedTemporaryFile(delete=True)
            handles[self.filename.format(file_no)] = handle
    return handles
def _write_local_data_files(self, cursor):
    """
    Takes a cursor, and writes results to a local file.

    :return: A dictionary where keys are filenames to be used as object
        names in GCS, and values are file handles to local files that
        contain the data for the GCS objects.
    """
    # BUG FIX: on Python 3 `map(...)` is a one-shot iterator, so without
    # list() the schema was exhausted after the first row and every later
    # row_dict came out empty.
    schema = list(map(lambda schema_tuple: schema_tuple[0], cursor.description))
    file_no = 0
    tmp_file_handle = NamedTemporaryFile(delete=True)
    tmp_file_handles = {self.filename.format(file_no): tmp_file_handle}

    for row in cursor:
        # Convert datetime objects to utc seconds, and decimals to floats
        row = map(self.convert_types, row)
        row_dict = dict(zip(schema, row))

        # TODO validate that row isn't > 2MB. BQ enforces a hard row size of 2MB.
        # BUG FIX: NamedTemporaryFile defaults to binary mode ('w+b'), so
        # json.dump()/write() of str objects raises TypeError on Python 3 —
        # serialize to a string first and write UTF-8 encoded bytes.
        tmp_file_handle.write(json.dumps(row_dict).encode('utf-8'))

        # Append newline to make dumps BigQuery compatible.
        tmp_file_handle.write(b'\n')

        # Stop if the file exceeds the file size limit.
        if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
            file_no += 1
            tmp_file_handle = NamedTemporaryFile(delete=True)
            tmp_file_handles[self.filename.format(file_no)] = tmp_file_handle

    return tmp_file_handles
def _write_local_data_files(self, cursor):
    """
    Takes a cursor, and writes results to a local file.

    :return: A dictionary where keys are filenames to be used as object
        names in GCS, and values are file handles to local files that
        contain the data for the GCS objects.
    """
    file_no = 0
    current_file = NamedTemporaryFile(delete=True)
    handles = {self.filename.format(file_no): current_file}
    for row in cursor:
        record = self.generate_data_dict(row._fields, row)
        current_file.write(json.dumps(record).encode('utf-8'))
        # Append newline to make dumps BigQuery compatible.
        current_file.write(b'\n')
        # Roll over to a new temp file once the size limit is reached.
        if current_file.tell() >= self.approx_max_file_size_bytes:
            file_no += 1
            current_file = NamedTemporaryFile(delete=True)
            handles[self.filename.format(file_no)] = current_file
    return handles
def make_image_file(dimensions=(320, 240), extension=".jpeg", force_size=None, orientation=None):
    """
    Yields a named temporary file created with the specified image type and
    options.

    Note the default dimensions are unequal (not a square) ensuring that
    center-square cropping logic will be exercised during tests.

    The temporary file will be closed and deleted automatically upon exiting
    the `with` block.
    """
    image = Image.new('RGB', dimensions, "green")
    image_file = NamedTemporaryFile(suffix=extension)
    try:
        if orientation and orientation in xrange(1, 9):
            # embed the requested EXIF orientation tag in the saved image
            exif_bytes = piexif.dump({'0th': {piexif.ImageIFD.Orientation: orientation}})
            image.save(image_file, exif=exif_bytes)
        else:
            image.save(image_file)
        if force_size is not None:
            # pad the file with zero bytes until it reaches force_size,
            # writing in hunks of 256 bytes
            image_file.seek(0, os.SEEK_END)
            bytes_to_pad = force_size - image_file.tell()
            hunk = bytearray(256)
            single = bytearray(1)
            num_hunks, remainder = divmod(bytes_to_pad, 256)
            for _ in xrange(num_hunks):
                image_file.write(hunk)
            for _ in xrange(remainder):
                image_file.write(single)
            image_file.flush()
        image_file.seek(0)
        yield image_file
    finally:
        image_file.close()
def make_image_file(dimensions=(320, 240), extension=".jpeg", force_size=None, orientation=None):
    """
    Yields a named temporary file created with the specified image type and
    options.

    The temporary file will be closed and deleted automatically upon exiting
    the `with` block.
    """
    image = Image.new('RGB', dimensions, "green")
    image_file = NamedTemporaryFile(suffix=extension)
    try:
        if orientation and orientation in xrange(1, 9):
            # embed the requested EXIF orientation tag in the saved image
            exif_bytes = piexif.dump({'0th': {piexif.ImageIFD.Orientation: orientation}})
            image.save(image_file, exif=exif_bytes)
        else:
            image.save(image_file)
        if force_size is not None:
            # pad with zero bytes (256 at a time) until the file reaches
            # force_size
            image_file.seek(0, os.SEEK_END)
            bytes_to_pad = force_size - image_file.tell()
            hunk = bytearray(256)
            single = bytearray(1)
            num_hunks, remainder = divmod(bytes_to_pad, 256)
            for _ in xrange(num_hunks):
                image_file.write(hunk)
            for _ in xrange(remainder):
                image_file.write(single)
            image_file.flush()
        image_file.seek(0)
        yield image_file
    finally:
        image_file.close()
def _write_local_data_files(self, cursor):
    """
    Takes a cursor, and writes results to a local file.

    :return: A dictionary where keys are filenames to be used as object
        names in GCS, and values are file handles to local files that
        contain the data for the GCS objects.
    """
    file_no = 0
    current_file = NamedTemporaryFile(delete=True)
    handles = {self.filename.format(file_no): current_file}
    for row in cursor:
        record = self.generate_data_dict(row._fields, row)
        payload = json.dumps(record)
        if PY3:
            payload = payload.encode('utf-8')
        current_file.write(payload)
        # Append newline to make dumps BigQuery compatible.
        current_file.write(b'\n')
        # Roll over to a new temp file once the size limit is reached.
        if current_file.tell() >= self.approx_max_file_size_bytes:
            file_no += 1
            current_file = NamedTemporaryFile(delete=True)
            handles[self.filename.format(file_no)] = current_file
    return handles
def convert_icon(filelike: IO[bytes], pixel_size: int) -> IO[bytes]:
    """Convert the input image (often ICO format) into a (crushed) 32x32 PNG.

    Crushing is important because there will be a lot of these images and we
    want to keep them small to reduce s3 costs and ensure that they are
    retained in HTTP caches as long as possible."""
    image = Image.open(filelike)
    # Create a named temporary file with auto deletion off — pngcrush needs a
    # real path, and we clean up ourselves in the finally block.
    temp_file = NamedTemporaryFile(
        mode="r+b", delete=False, prefix="quarchive-tmp-icon-", suffix=".png"
    )
    # BUG FIX: was f"{temp_file}.crushed", which interpolated the repr of the
    # wrapper object (e.g. "<tempfile._TemporaryFileWrapper ...>.crushed")
    # instead of the temp file's path.
    crushed_filename = f"{temp_file.name}.crushed"
    try:
        resized = image.resize((pixel_size, pixel_size), resample=Image.LANCZOS)
        resized.save(temp_file, format="png")
        initial_size = temp_file.tell()

        # Close the handle to write out new image file to the fs
        temp_file.close()

        # pngcrush does a lot better than PIL at optimizing (50% or more)
        result = subprocess.run(
            ["pngcrush", temp_file.name, crushed_filename],
            capture_output=True,
        )
        # Raise an exception if something went wrong
        try:
            result.check_returncode()
        except subprocess.CalledProcessError:
            log.error(
                "pngcrush failed: stdout='%s', stderr='%s'",
                result.stdout,
                result.stderr,
            )
            raise

        # Open a handle to the new, crushed, png
        rv = open(crushed_filename, mode="r+b")

        # Log out the size reduction
        rv.seek(0, 2)
        crushed_size = rv.tell()
        rv.seek(0)
        log.debug("reduced image from %d bytes to %d", initial_size, crushed_size)
    finally:
        # Clean up our temp files.  The open handle `rv` keeps the unlinked
        # crushed file readable on POSIX.  BUG FIX: guard the removals so a
        # pngcrush failure isn't masked by FileNotFoundError when the
        # .crushed output was never created.
        if os.path.exists(temp_file.name):
            os.remove(temp_file.name)
        if os.path.exists(crushed_filename):
            os.remove(crushed_filename)
    return rv
def _write_local_data_files(self, cursor):
    """
    Takes a cursor, and writes results to a local file.

    :return: A dictionary where keys are filenames to be used as object
        names in GCS, and values are file handles to local files that
        contain the data for the GCS objects.
    """
    # Column names straight from the DB-API cursor description.
    schema = list(map(lambda schema_tuple: schema_tuple[0], cursor.description))
    col_type_dict = self._get_col_type_dict()
    file_no = 0
    tmp_file_handle = NamedTemporaryFile(delete=True)
    if self.export_format == 'csv':
        file_mime_type = 'text/csv'
    else:
        file_mime_type = 'application/json'
    # Each entry describes one chunk file destined for GCS upload.
    files_to_upload = [{
        'file_name': self.filename.format(file_no),
        'file_handle': tmp_file_handle,
        'file_mime_type': file_mime_type
    }]
    if self.export_format == 'csv':
        csv_writer = self._configure_csv_file(tmp_file_handle, schema)
    for row in cursor:
        # Convert datetime objects to utc seconds, and decimals to floats.
        # Convert binary type object to string encoded with base64.
        row = self._convert_types(schema, col_type_dict, row)
        if self.export_format == 'csv':
            csv_writer.writerow(row)
        else:
            row_dict = dict(zip(schema, row))
            # TODO validate that row isn't > 2MB. BQ enforces a hard row size of 2MB.
            s = json.dumps(row_dict, sort_keys=True)
            if PY3:
                # the temp file is opened in binary mode on Python 3
                s = s.encode('utf-8')
            tmp_file_handle.write(s)
            # Append newline to make dumps BigQuery compatible.
            tmp_file_handle.write(b'\n')
        # Stop if the file exceeds the file size limit: roll over to a fresh
        # temp file and register it for upload.
        if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
            file_no += 1
            tmp_file_handle = NamedTemporaryFile(delete=True)
            files_to_upload.append({
                'file_name': self.filename.format(file_no),
                'file_handle': tmp_file_handle,
                'file_mime_type': file_mime_type
            })
            if self.export_format == 'csv':
                # A new file needs its own csv writer bound to it.
                csv_writer = self._configure_csv_file(tmp_file_handle, schema)
    return files_to_upload
def write(self, make_backup=True):
    """
    Write the GUANO .WAV file to disk.

    :param bool make_backup: create a backup file copy before writing changes or not
        (default: True); backups will be saved to a folder named `GUANO_BACKUP`
    :raises ValueError: if this `GuanoFile` doesn't represent a valid .WAV by
        having appropriate values for `self.wav_params` (see
        :meth:`wave.Wave_write.setparams()`) and `self.wav_data` (see
        :meth:`wave.Wave_write.writeframes()`)
    """
    # FIXME: optionally write other unknown subchunks for redundant metadata formats

    if not self.filename:
        raise ValueError('Cannot write .WAV file without a self.filename!')
    if not self.wav_params:
        raise ValueError('Cannot write .WAV file without appropriate self.wav_params (see `wavfile.setparams()`)')
    if not self.wav_data:
        raise ValueError('Cannot write .WAV file without appropriate self.wav_data (see `wavfile.writeframes()`)')

    # prepare our metadata for a byte-wise representation
    md_bytes = self.serialize()

    # create tempfile and write our vanilla .WAV ('data' sub-chunk only)
    # NOTE: the local name `tempfile` shadows the stdlib `tempfile` module
    # within this method body.
    tempfile = NamedTemporaryFile(mode='w+b', prefix='guano_temp-', suffix='.wav', delete=False)
    if os.path.isfile(self.filename):
        # preserve timestamps/permissions of the original file
        shutil.copystat(self.filename, tempfile.name)
    with closing(wave.Wave_write(tempfile)) as wavfile:
        wavfile.setparams(self.wav_params)
        wavfile.writeframes(self.wav_data)

    # add the 'guan' sub-chunk after the 'data' sub-chunk
    tempfile.write(_chunkid.pack(b'guan'))
    tempfile.write(_chunksz.pack(len(md_bytes)))
    tempfile.write(md_bytes)

    # fix the RIFF file length (RIFF size field at offset 4 excludes the
    # 8-byte RIFF header itself)
    total_size = tempfile.tell()
    tempfile.seek(0x04)
    tempfile.write(_chunksz.pack(total_size - 8))
    tempfile.close()

    # verify it by re-parsing the new version
    GuanoFile(tempfile.name)

    # finally overwrite the original with our new version (and optionally back up first)
    if make_backup and os.path.exists(self.filename):
        backup_dir = os.path.join(os.path.dirname(self.filename), 'GUANO_BACKUP')
        backup_file = os.path.join(backup_dir, os.path.basename(self.filename))
        if not os.path.isdir(backup_dir):
            log.debug('Creating backup dir: %s', backup_dir)
            os.mkdir(backup_dir)
        if os.path.exists(backup_file):
            os.remove(backup_file)
        os.rename(self.filename, backup_file)
    os.rename(tempfile.name, self.filename)
def write(self, make_backup=True):
    """Write the GUANO file to disk.

    :param bool make_backup: if True, move the original file into a
        `GUANO_BACKUP` folder next to it before overwriting.
    """
    # FIXME: optionally write *other* subchunks for redundant metadata formats

    # prepare our metadata for a byte-wise representation
    md_bytes = bytearray(self._as_string(), 'utf-8')
    if len(md_bytes) % 2:
        md_bytes.append(ord('\n'))  # pad for alignment on even word boundary

    # create tempfile and write our vanilla .WAV ('data' sub-chunk only)
    # NOTE: the local name `tempfile` shadows the stdlib `tempfile` module.
    tempfile = NamedTemporaryFile(mode='w+b', prefix='guano_temp-', suffix='.wav', delete=False)
    shutil.copystat(self.filename, tempfile.name)
    with closing(wave.Wave_write(tempfile)) as wavfile:
        wavfile.setparams(self.wav_params)
        wavfile.writeframes(self.wav_data)

    # add the 'guan' sub-chunk after the 'data' sub-chunk
    # BUG FIX: the '4s' format requires bytes on Python 3 — the chunk id
    # must be b'guan', not the str 'guan'.  (Also dropped the no-op
    # `tempfile.seek(tempfile.tell())` that preceded this write.)
    tempfile.write(struct.pack('<4sL', b'guan', len(md_bytes)))
    tempfile.write(md_bytes)

    # fix the RIFF file length (size field at offset 4 excludes the 8-byte
    # RIFF header itself)
    total_size = tempfile.tell()
    tempfile.seek(0x04)
    tempfile.write(struct.pack('<L', total_size - 8))
    tempfile.close()

    # verify it
    GuanoFile(tempfile.name)

    # finally overwrite the original with our new version
    if make_backup:
        backup_dir = os.path.join(os.path.dirname(self.filename), 'GUANO_BACKUP')
        backup_file = os.path.join(backup_dir, os.path.basename(self.filename))
        if not os.path.isdir(backup_dir):
            # BUG FIX: `print >> sys.stderr` is Python 2-only syntax and was
            # a SyntaxError on Python 3; use the print function instead.
            print('Creating backup dir: ' + backup_dir, file=sys.stderr)
            os.mkdir(backup_dir)
        if os.path.exists(backup_file):
            os.remove(backup_file)
        os.rename(self.filename, backup_file)
    os.rename(tempfile.name, self.filename)
def _create_combined_success(self, success_fps):
    """
    Merge all success_fps into a single success_fp.

    Returns a new success_fp.
    """
    # Two rolling temp files: prev_fp holds the merge result so far,
    # combined_fp receives the next merge.  Ownership is swapped between
    # them each round; whichever names are non-None are closed on error.
    combined_fp = prev_fp = None
    combined_fp = NamedTemporaryFile(delete=True, mode='w+')
    try:
        prev_fp = NamedTemporaryFile(delete=True, mode='w+')  # start blank
        # Add all success_fps into combined_fp. Update prev_fp to
        # hold combined_fp.
        for added_fp in success_fps:
            if added_fp is None:
                continue
            added_size = added_fp.tell()
            added_fp.seek(0)
            if added_size:
                prev_size = prev_fp.tell()
                prev_fp.seek(0)
                log.info('Merging success lists (%d into %d)',
                         added_size, prev_size)
                _comm(
                    _comm_input(prev_fp, added_fp),
                    _comm_actions(
                        # Keep it if in both:
                        both=(lambda e: combined_fp.write(e)),
                        # Keep it if we already had it:
                        leftonly=(lambda d: combined_fp.write(d)),
                        # Keep it if we added it now:
                        rightonly=(lambda a: combined_fp.write(a))))
                combined_fp.flush()
                # We don't need left anymore. Make combined the new left.
                # Create new combined where we merge the next success_fp.
                prev_fp.close()
                prev_fp, combined_fp = combined_fp, None
                combined_fp = NamedTemporaryFile(delete=True, mode='w+')
        # We want combined_fp at this point, but it's currently in
        # prev_fp. Note that the new combined_fp is at EOF (unseeked).
        combined_fp.close()
        combined_fp, prev_fp = prev_fp, None
    except Exception:
        # Close whichever temp files are still owned before re-raising.
        if prev_fp:
            prev_fp.close()
        if combined_fp:
            combined_fp.close()
        raise
    return combined_fp
def page_download_task(page_num, r, page_url=None):
    """Download one page from the streamed response `r` into a temp file.

    Reports success or failure back to the mangadex@home network, then
    returns a ``(page_num, file_handle)`` tuple on success or raises
    ``exceptions.ScrapingError`` if the connection drops mid-download.
    """
    ext = BaseChapter.guess_extension(r.headers.get("content-type"))
    f = NamedTemporaryFile(suffix=ext, delete=False)
    download_start_time = int(time.time())
    try:
        for chunk in r.iter_content(chunk_size=4096):
            if chunk:
                f.write(chunk)
    except ConnectionError:
        f.flush()
        # page failed to download, send failure report
        if debug:
            output.warning("Mangadex API: send failure report")
        requests.post("https://api.mangadex.network/report", data={
            "url": page_url,
            "success": False,
            "bytes": f.tell(),
            "duration": int(time.time()) - download_start_time,
            "cached": True if r.headers.get("X-Cache") else False
        })
        # BUG FIX: close both handles before bailing out — the temp file was
        # created with delete=False, so its descriptor (and the response
        # socket) leaked on every failed page.
        f.close()
        r.close()
        raise exceptions.ScrapingError
    f.flush()
    # page download successful, send success report
    if debug:
        output.warning("Mangadex API: send success report")
    requests.post("https://api.mangadex.network/report", data={
        "url": page_url,
        "success": True,
        "bytes": f.tell(),
        "duration": int(time.time()) - download_start_time,
        "cached": True if r.headers.get("X-Cache") else False
    })
    f.close()
    r.close()
    return ((page_num, f))
def prepareUploadFile(prefix=""):
    """Helper that builds a fake PUT upload (FileUpload plus backing file)."""
    handle = NamedTemporaryFile(mode='w+', prefix=prefix)
    handle.write("google-site-verification: " + handle.name)
    # measure the written size, then rewind for reading
    handle.seek(0, 2)
    size = handle.tell()
    handle.seek(0)

    environ = {'REQUEST_METHOD': 'PUT'}
    headers = {
        'content-type': 'text/plain',
        'content-length': size,
        'content-disposition': 'attachment; filename=%s' % handle.name,
    }
    storage = FieldStorage(fp=handle, environ=environ, headers=headers)
    return FileUpload(storage), handle
def make_blocks(num_records=2000, codec='null', write_to_disk=False):
    """Serialize `num_records` generated records with fastavro and read them
    back as blocks.

    :param num_records: how many records to generate and write.
    :param codec: avro compression codec name passed to the writer.
    :param write_to_disk: back the stream with a real temp file instead of
        an in-memory buffer.
    :return: (blocks, records, num_bytes) — the decoded blocks, the source
        records, and the serialized size in bytes.
    """
    records = make_records(num_records)
    new_file = NamedTemporaryFile() if write_to_disk else MemoryIO()
    fastavro.writer(new_file, schema, records, codec=codec)
    # Renamed from `bytes`: the local shadowed the builtin, which would
    # break any later use of bytes() in this scope.
    num_bytes = new_file.tell()
    new_file.seek(0)
    block_reader = fastavro.block_reader(new_file, schema)
    blocks = list(block_reader)
    new_file.close()
    return blocks, records, num_bytes
def make_image_file(dimensions=(320, 240), prefix='tmp', extension='.jpeg', force_size=None, orientation=None):
    """
    Yields a named temporary file created with the specified image type and
    options.

    Note the default dimensions are unequal (not a square) ensuring that
    center-square cropping logic will be exercised during tests.

    The temporary file will be closed and deleted automatically upon exiting
    the `with` block.

    prefix - To add prefix to random image file name, after adding will be
    like <custom-prefix><random-name>.png otherwise by default `tmp` is added
    making file name tmp<random-name>.png.
    """
    image = Image.new('RGB', dimensions, "green")
    image_file = NamedTemporaryFile(prefix=prefix, suffix=extension)
    try:
        if orientation and orientation in xrange(1, 9):
            # embed the requested EXIF orientation tag in the saved image
            exif_bytes = piexif.dump(
                {'0th': {piexif.ImageIFD.Orientation: orientation}})
            image.save(image_file, exif=exif_bytes)
        else:
            image.save(image_file)
        if force_size is not None:
            # pad the file with zero bytes until it reaches force_size,
            # writing in hunks of 256 bytes
            image_file.seek(0, os.SEEK_END)
            bytes_to_pad = force_size - image_file.tell()
            hunk = bytearray(256)
            single = bytearray(1)
            num_hunks, remainder = divmod(bytes_to_pad, 256)
            for _ in xrange(num_hunks):
                image_file.write(hunk)
            for _ in xrange(remainder):
                image_file.write(single)
            image_file.flush()
        image_file.seek(0)
        yield image_file
    finally:
        image_file.close()
def process_askue():
    # Pulls the newest ASKUE inbox file from FTP, optionally appends freshly
    # exported route records, uploads the result, and cleans the inbox.
    e = Exporter()
    try:
        logging.debug('Trying to connect to FTP server...')
        with FTP(S.FTP_SERVER, S.FTP_USER, S.FTP_PASSWORD, timeout=5) as fc:
            logging.debug('Looking for files in FTP directory')
            # Find files and retrieve it
            # NOTE(review): the comprehension variable `e` shadows the
            # Exporter above on Python 2 (harmless on Python 3).
            inbox_files = fc.mlsd(S.REMS_PATH)
            filenames = [e[0] for e in inbox_files if askue_filename(e[0])]
            if not filenames:
                logging.info('Inbox directory is empty...')
                return
            if len(filenames) > 1:
                logging.debug(
                    'More than 1 file were found: {}'.format(
                        '\n'.join(filenames)))
            # Only the most recent file (by date encoded in its name) is sent.
            rfile = max(filenames, key=date_from_filename)
            logging.info('Retrieving {}...'.format(rfile))
            tf = NamedTemporaryFile()
            fc.retrbinary('RETR {}'.format(j(S.REMS_PATH, rfile)), tf.write)
            # Remember where the pristine FTP download ends so we can roll
            # back if appending fails.
            ftp_pos = tf.tell()
            try:
                if S.APPEND_ON:
                    lines = (record_to_csv(rec)
                             for rec in e.get_routes(datetime.now()))
                    append_lines(tf, lines)
                else:
                    logging.debug(
                        'Will not append lines (switched off in settings)')
            except Exception:
                logging.exception(
                    'Error appending lines to file! Sending as is')
                # Roll back to the original download.
                tf.seek(ftp_pos)
                tf.truncate()
            tf.seek(0)
            dest_path = j(S.IOMM_PATH, rfile)
            # Send file back to FTP
            logging.info('Sending file... {}'.format(dest_path))
            fc.storbinary('STOR {}'.format(dest_path), tf)
            logging.info('Cleaning up directory...')
            for fname in filenames:
                filepath = j(S.REMS_PATH, fname)
                fc.delete(filepath)
    finally:
        e.close_connection()
def download_url(self, url, partial_fetch=False):
    """Fetch `url` into a temp file, optionally stopping after ~2 MB.

    :return: a (content_type, content_length, file_handle) tuple.
    """
    http_resp = self.http_session.get(url, stream=True, timeout=(60, 120))
    http_resp.raise_for_status()

    if not os.path.exists(TEMP_DIR_PATH):
        log.debug('Creating temp directory %s' % TEMP_DIR_PATH)
        os.makedirs(TEMP_DIR_PATH)

    # Create a temporary file to store the media item, write the file
    # to disk if it is larger than 1 MB.
    media_file = NamedTemporaryFile(dir=TEMP_DIR_PATH)

    # When a partial fetch is requested, request up to two MB
    partial_target_size = 1024 * 1024 * 2
    content_length = http_resp.headers.get('content-length')
    if content_length and int(content_length) < partial_target_size:
        partial_target_size = int(content_length)

    retrieved_bytes = 0
    for chunk in http_resp.iter_content(chunk_size=512 * 1024):
        if not chunk:
            # filter out keep-alive chunks
            continue
        media_file.write(chunk)
        retrieved_bytes += len(chunk)
        if partial_fetch and retrieved_bytes >= partial_target_size:
            break
    media_file.flush()
    log.debug('Fetched item %s [%s/%s]' % (url, retrieved_bytes, content_length))

    # If the server doens't provide a content-length and this isn't
    # a partial fetch, determine the size by looking at the retrieved
    # content
    if not content_length and not partial_fetch:
        media_file.seek(0, 2)
        content_length = media_file.tell()
        media_file.seek(0, 0)

    return (
        http_resp.headers.get('content-type'),
        content_length,
        media_file
    )
def download_url(self, url, partial_fetch=False):
    """Fetch `url` into a temp file and wrap it in a FileResource.

    When `partial_fetch` is set, at most ~2 MB is retrieved.
    """
    # NOTE(review): verify=False disables TLS certificate validation.
    http_resp = self.http_session.get(
        url, stream=True, timeout=(60, 120), verify=False)
    http_resp.raise_for_status()

    # Create a temporary file to store the media item, write the file
    # to disk if it is larger than 1 MB.
    media_file = NamedTemporaryFile(delete=True)

    # When a partial fetch is requested, request up to two MB
    partial_target_size = 1024 * 1024 * 2
    content_length = http_resp.headers.get('content-length')
    if content_length and int(content_length) < partial_target_size:
        partial_target_size = int(content_length)

    retrieved_bytes = 0
    for chunk in http_resp.iter_content(chunk_size=512 * 1024):
        if not chunk:
            # filter out keep-alive chunks
            continue
        media_file.write(chunk)
        retrieved_bytes += len(chunk)
        if partial_fetch and retrieved_bytes >= partial_target_size:
            break
    media_file.flush()
    log.debug('Fetched item %s [%s/%s]' % (url, retrieved_bytes, content_length))

    # If the server doens't provide a content-length and this isn't
    # a partial fetch, determine the size by looking at the retrieved
    # content
    if not content_length and not partial_fetch:
        media_file.seek(0, 2)
        content_length = media_file.tell()
        media_file.seek(0, 0)

    resource = FileResource(media_file)
    resource.content_type = http_resp.headers.get('content-type')
    resource.file_size = content_length
    return resource
def _write_local_data_files(self, cursor):
    """
    Takes a cursor, and writes results to a local file.

    :return: A dictionary where keys are filenames to be used as object
        names in GCS, and values are file handles to local files that
        contain the data for the GCS objects.
    """
    field_names = [desc[0] for desc in cursor.description]
    mysql_types = [desc[1] for desc in cursor.description]
    # Flag which columns carry binary payloads so they can be base64-encoded.
    byte_fields = [
        self.is_binary(t, f)
        for t, f in zip(mysql_types, cursor.description_flags)
    ]

    file_no = 0
    handle = NamedTemporaryFile(mode='w', delete=True)
    handles = {self.filename.format(file_no): handle}

    for row in cursor:
        # Convert datetime objects to utc seconds, decimals to floats, and
        # binaries to base64-encoded strings
        record = {
            name: self.convert_types(value, is_binary)
            for name, value, is_binary in zip(field_names, row, byte_fields)
        }

        # TODO validate that row isn't > 2MB. BQ enforces a hard row size of 2MB.
        json.dump(record, handle)
        # Append newline to make dumps BigQuery compatible.
        handle.write('\n')

        # Stop if the file exceeds the file size limit.
        if handle.tell() >= self.approx_max_file_size_bytes:
            file_no += 1
            handle = NamedTemporaryFile(mode='w', delete=True)
            handles[self.filename.format(file_no)] = handle

    return handles
def _write_local_data_files(self, cursor):
    """
    Takes a cursor, and writes results to a local file.

    :return: A dictionary where keys are filenames to be used as object
        names in GCS, and values are file handles to local files that
        contain the data for the GCS objects.
    """

    class BinaryTypeEncoder(json.JSONEncoder):
        # Decode raw bytes to text so json can serialize them on Python 3.
        def default(self, obj):
            if PY3 and isinstance(obj, binary_type):
                return str(obj, 'utf-8')
            return json.JSONEncoder.default(self, obj)

    schema = [column[0] for column in cursor.description]
    file_no = 0
    handle = NamedTemporaryFile(delete=True)
    handles = {self.filename.format(file_no): handle}

    for row in cursor:
        # Convert datetime objects to utc seconds, and decimals to floats
        record = dict(zip(schema, map(self.convert_types, row)))

        # TODO validate that row isn't > 2MB. BQ enforces a hard row size of 2MB.
        serialized = json.dumps(record, cls=BinaryTypeEncoder)
        if PY3:
            serialized = serialized.encode('utf-8')
        handle.write(serialized)
        # Append newline to make dumps BigQuery compatible.
        handle.write(b'\n')

        # Stop if the file exceeds the file size limit.
        if handle.tell() >= self.approx_max_file_size_bytes:
            file_no += 1
            handle = NamedTemporaryFile(delete=True)
            handles[self.filename.format(file_no)] = handle

    return handles
def csv_export(request, username, id_string):
    """Serve a form's submissions as a CSV download."""
    owner = get_object_or_404(User, username=username)
    xform = get_object_or_404(XForm, id_string=id_string, user=owner)
    if not has_permission(xform, owner, request):
        return HttpResponseForbidden(_(u"Not shared."))
    query = request.GET.get("query")
    builder = CSVDataFrameBuilder(username, id_string, query)
    try:
        temp_file = NamedTemporaryFile(suffix=".csv")
        builder.export_to(temp_file)
        if request.GET.get("raw"):
            id_string = None
        response = response_with_mimetype_and_name(
            "application/csv", id_string, extension="csv")
        # rewind to stream the file out, then measure it for Content-Length
        temp_file.seek(0)
        response.write(temp_file.read())
        temp_file.seek(0, os.SEEK_END)
        response["Content-Length"] = temp_file.tell()
        temp_file.close()
        return response
    except NoRecordsFoundError:
        return HttpResponse(_("No records found to export"))
def _write_local_data_files(self, cursor):
    """
    Takes a cursor, and writes results to a local file.

    :return: A dictionary where keys are filenames to be used as object
        names in GCS, and values are file handles to local files that
        contain the data for the GCS objects.
    """
    schema = [description[0] for description in cursor.description]
    file_no = 0
    row_no = 0
    tmp_file_handle = NamedTemporaryFile(delete=True)
    tmp_file_handles = {self.filename.format(file_no): tmp_file_handle}

    for row in cursor:
        # Convert datetime objects to utc seconds, and decimals to floats.
        row_dict = dict(zip(schema, map(self.convert_types, row)))
        payload = json.dumps(row_dict, sort_keys=True)
        if PY3:
            payload = payload.encode("utf-8")
        tmp_file_handle.write(payload)

        # Append newline to make dumps BigQuery compatible.
        tmp_file_handle.write(b"\n")

        # Rotate to a fresh temp file once the size limit is reached.
        if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
            file_no += 1
            tmp_file_handle = NamedTemporaryFile(delete=True)
            tmp_file_handles[self.filename.format(file_no)] = tmp_file_handle
        row_no += 1

    self.log.info("Received %s rows over %s files", row_no, file_no + 1)
    return tmp_file_handles
def file_reader(url, proxy, sslContext):
    """Download *url* into a seekable temporary file, logging progress
    at most once per PROGRESS_INTERVAL seconds. Returns the open file
    positioned at offset 0."""
    local_file = NamedTemporaryFile(mode='w+b', prefix='subcontractor_')
    logging.debug('file_reader: downloading "{0}"'.format(url))
    resp = open_url(url, proxy, 200, sslContext)
    size = int(resp.headers['content-length'])

    chunk = resp.read(4096 * 1024)
    next_report = datetime.utcnow()
    while chunk:
        # Throttle progress logging to one line per interval.
        if datetime.utcnow() > next_report:
            next_report = datetime.utcnow() + timedelta(seconds=PROGRESS_INTERVAL)
            logging.debug('file_reader: download at {0} of {1}'.format(
                local_file.tell(), size))
        local_file.write(chunk)
        chunk = resp.read(4096 * 1024)

    local_file.flush()
    local_file.seek(0)
    return local_file
def xls_export(request, username, id_string):
    """Serve a form's submissions as a downloadable Excel workbook."""
    owner = get_object_or_404(User, username=username)
    xform = get_object_or_404(XForm, id_string=id_string, user=owner)
    if not has_permission(xform, owner, request):
        return HttpResponseForbidden(_(u'Not shared.'))

    query = request.GET.get("query")
    force_xlsx = request.GET.get('xlsx') == 'true'
    xls_df_builder = XLSDataFrameBuilder(username, id_string, query)

    # Use xlsx when explicitly requested, or when the data set is too
    # large for the legacy xls format.
    exceeds_limits = xls_df_builder.exceeds_xls_limits
    if force_xlsx or exceeds_limits:
        ext = 'xlsx'
        suffix = '.xlsx'
        mime_type = 'vnd.openxmlformats'  # TODO: check xlsx mime type
    else:
        ext = 'xls'
        suffix = '.xls'
        mime_type = 'vnd.ms-excel'

    try:
        temp_file = NamedTemporaryFile(suffix=suffix)
        xls_df_builder.export_to(temp_file.name)
        if request.GET.get('raw'):
            id_string = None
        response = response_with_mimetype_and_name(mime_type,
                                                   id_string,
                                                   extension=ext)
        response.write(temp_file.read())
        # Measure the file for the Content-Length header.
        temp_file.seek(0, os.SEEK_END)
        response['Content-Length'] = temp_file.tell()
        temp_file.close()
        return response
    except NoRecordsFoundError:
        return HttpResponse(_("No records found to export"))
def test_roundtrip3():
    """Round-trip a .pyc: load it, rewrite it with write_pycfile, and
    verify the first `compare_size` bytes match the original file.

    Fix: the original opened `old_fp`/`new_fp` and never closed them;
    use context managers so the handles are released before unlinking.
    """
    if not PYTHON3:
        print(
            "test skipped because Python 2.x has problems creating Python 3.x files"
        )
        return
    fp = NamedTemporaryFile(mode="wb+",
                            suffix=".pyc",
                            prefix="test_pyc-",
                            delete=False)
    orig_path = "testdata/test_pyc.pyc"
    version, timestamp, magic_int, co, is_pypy, source_size, sip_hash = load_module(
        orig_path)
    write_pycfile(fp, [co], timestamp, version)
    new_path = fp.name
    size = fp.tell()
    fp.close()
    print("Wrote Python %s bytecode file %s; %d bytes" %
          (version, fp.name, size))
    # Only the leading bytes are compared; trailing metadata may differ.
    compare_size = 590
    with open(orig_path, "rb") as old_fp, open(new_path, "rb") as new_fp:
        assert old_fp.read(compare_size) == new_fp.read(compare_size)
    os.unlink(new_path)
def _write_local_data_files(self, cursor):
    """
    Takes a cursor, and writes results to a local file.

    :return: A list of dictionaries, one per output file, with keys
        ``file_name`` (GCS object name), ``file_handle`` (open temp file
        containing that object's data) and ``file_mime_type``.

    Fix: the parquet branch previously opened a ``with pq.ParquetWriter``
    per *row*, which wrote a parquet footer after every row and produced
    corrupt files (and the writer for the final file was never
    finalized). A single writer is now kept per output file, closed when
    the file rotates and before returning.
    """
    schema = list(map(lambda schema_tuple: schema_tuple[0], cursor.description))
    col_type_dict = self._get_col_type_dict()
    file_no = 0

    tmp_file_handle = NamedTemporaryFile(delete=True)
    if self.export_format == 'csv':
        file_mime_type = 'text/csv'
    elif self.export_format == 'parquet':
        file_mime_type = 'application/octet-stream'
    else:
        file_mime_type = 'application/json'
    files_to_upload = [
        {
            'file_name': self.filename.format(file_no),
            'file_handle': tmp_file_handle,
            'file_mime_type': file_mime_type,
        }
    ]
    self.log.info("Current file count: %d", len(files_to_upload))

    if self.export_format == 'csv':
        csv_writer = self._configure_csv_file(tmp_file_handle, schema)
    if self.export_format == 'parquet':
        parquet_schema = self._convert_parquet_schema(cursor)
        # One writer per output file; a per-row writer would emit a
        # footer after every row and corrupt the file.
        parquet_writer = pq.ParquetWriter(tmp_file_handle, parquet_schema)

    for row in cursor:
        # Convert datetime objects to utc seconds, and decimals to floats.
        # Convert binary type object to string encoded with base64.
        row = self.convert_types(schema, col_type_dict, row)

        if self.export_format == 'csv':
            if self.null_marker is not None:
                row = [value if value is not None else self.null_marker
                       for value in row]
            csv_writer.writerow(row)
        elif self.export_format == 'parquet':
            if self.null_marker is not None:
                row = [value if value is not None else self.null_marker
                       for value in row]
            row_pydic = {col: [value] for col, value in zip(schema, row)}
            # Build the single-row table against the writer's schema so
            # the types always match what the writer expects.
            tbl = pa.Table.from_pydict(row_pydic, parquet_schema)
            parquet_writer.write_table(tbl)
        else:
            row_dict = dict(zip(schema, row))
            tmp_file_handle.write(
                json.dumps(row_dict, sort_keys=True,
                           ensure_ascii=False).encode("utf-8")
            )
            # Append newline to make dumps BigQuery compatible.
            tmp_file_handle.write(b'\n')

        # Stop if the file exceeds the file size limit.
        if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
            file_no += 1
            if self.export_format == 'parquet':
                # Finalize the footer of the completed file before rotating.
                parquet_writer.close()
            tmp_file_handle = NamedTemporaryFile(delete=True)
            files_to_upload.append(
                {
                    'file_name': self.filename.format(file_no),
                    'file_handle': tmp_file_handle,
                    'file_mime_type': file_mime_type,
                }
            )
            self.log.info("Current file count: %d", len(files_to_upload))
            if self.export_format == 'csv':
                csv_writer = self._configure_csv_file(tmp_file_handle, schema)
            if self.export_format == 'parquet':
                parquet_writer = pq.ParquetWriter(tmp_file_handle, parquet_schema)

    if self.export_format == 'parquet':
        # Finalize the last file's footer before handing the files back.
        parquet_writer.close()
    return files_to_upload
class LazyZipOverHTTP:
    """File-like object mapped to a ZIP file over HTTP.

    This uses HTTP range requests to lazily fetch the file's content,
    which is supposed to be fed to ZipFile.  If such requests are
    not supported by the server, raise HTTPRangeRequestUnsupported
    during initialization.

    Downloaded ranges are written into a backing temporary file; the
    sorted, disjoint interval lists ``_left``/``_right`` record which
    byte ranges are already present so they are never re-fetched.
    """

    def __init__(self, url: str, session: PipSession,
                 chunk_size: int = CONTENT_CHUNK_SIZE) -> None:
        head = session.head(url, headers=HEADERS)
        raise_for_status(head)
        assert head.status_code == 200
        self._session, self._url, self._chunk_size = session, url, chunk_size
        self._length = int(head.headers["Content-Length"])
        # Sparse backing file, pre-sized to the remote file's length.
        self._file = NamedTemporaryFile()
        self.truncate(self._length)
        # Parallel lists of interval starts/ends already downloaded,
        # kept sorted and pairwise disjoint.
        self._left: List[int] = []
        self._right: List[int] = []
        if "bytes" not in head.headers.get("Accept-Ranges", "none"):
            raise HTTPRangeRequestUnsupported("range request is not supported")
        self._check_zip()

    @property
    def mode(self) -> str:
        """Opening mode, which is always rb."""
        return "rb"

    @property
    def name(self) -> str:
        """Path to the underlying file."""
        return self._file.name

    def seekable(self) -> bool:
        """Return whether random access is supported, which is True."""
        return True

    def close(self) -> None:
        """Close the file."""
        self._file.close()

    @property
    def closed(self) -> bool:
        """Whether the file is closed."""
        return self._file.closed

    def read(self, size: int = -1) -> bytes:
        """Read up to size bytes from the object and return them.

        As a convenience, if size is unspecified or -1,
        all bytes until EOF are returned.  Fewer than
        size bytes may be returned if EOF is reached.
        """
        # Round the request up to at least one chunk so small reads
        # still amortize the HTTP round-trip.
        download_size = max(size, self._chunk_size)
        start, length = self.tell(), self._length
        stop = length if size < 0 else min(start + download_size, length)
        start = max(0, stop - download_size)
        self._download(start, stop - 1)
        return self._file.read(size)

    def readable(self) -> bool:
        """Return whether the file is readable, which is True."""
        return True

    def seek(self, offset: int, whence: int = 0) -> int:
        """Change stream position and return the new absolute position.

        Seek to offset relative position indicated by whence:
        * 0: Start of stream (the default).  pos should be >= 0;
        * 1: Current position - pos may be negative;
        * 2: End of stream - pos usually negative.
        """
        return self._file.seek(offset, whence)

    def tell(self) -> int:
        """Return the current position."""
        return self._file.tell()

    def truncate(self, size: Optional[int] = None) -> int:
        """Resize the stream to the given size in bytes.

        If size is unspecified resize to the current position.
        The current stream position isn't changed.

        Return the new file size.
        """
        return self._file.truncate(size)

    def writable(self) -> bool:
        """Return False."""
        return False

    def __enter__(self) -> "LazyZipOverHTTP":
        self._file.__enter__()
        return self

    def __exit__(self, *exc: Any) -> Optional[bool]:
        return self._file.__exit__(*exc)

    @contextmanager
    def _stay(self) -> Iterator[None]:
        """Return a context manager keeping the position.

        At the end of the block, seek back to original position.
        """
        pos = self.tell()
        try:
            yield
        finally:
            self.seek(pos)

    def _check_zip(self) -> None:
        """Check and download until the file is a valid ZIP."""
        # Fetch ever-larger suffixes of the file until ZipFile can parse
        # the central directory (which lives at the end of a ZIP).
        end = self._length - 1
        for start in reversed(range(0, end, self._chunk_size)):
            self._download(start, end)
            with self._stay():
                try:
                    # For read-only ZIP files, ZipFile only needs
                    # methods read, seek, seekable and tell.
                    ZipFile(self)  # type: ignore
                except BadZipfile:
                    pass
                else:
                    break

    def _stream_response(self, start: int, end: int,
                         base_headers: Dict[str, str] = HEADERS) -> Response:
        """Return HTTP response to a range request from start to end."""
        headers = base_headers.copy()
        headers["Range"] = f"bytes={start}-{end}"
        # TODO: Get range requests to be correctly cached
        headers["Cache-Control"] = "no-cache"
        return self._session.get(self._url, headers=headers, stream=True)

    def _merge(self, start: int, end: int, left: int,
               right: int) -> Iterator[Tuple[int, int]]:
        """Return an iterator of intervals to be fetched.

        Yields only the gaps between already-downloaded intervals, and
        coalesces the [left:right) bookkeeping entries into one interval.

        Args:
            start (int): Start of needed interval
            end (int): End of needed interval
            left (int): Index of first overlapping downloaded data
            right (int): Index after last overlapping downloaded data
        """
        lslice, rslice = self._left[left:right], self._right[left:right]
        # Extend the needed interval to cover the overlapping entries.
        i = start = min([start] + lslice[:1])
        end = max([end] + rslice[-1:])
        for j, k in zip(lslice, rslice):
            if j > i:
                yield i, j - 1
            i = k + 1
        if i <= end:
            yield i, end
        # Replace the overlapped entries with the single merged interval.
        self._left[left:right], self._right[left:right] = [start], [end]

    def _download(self, start: int, end: int) -> None:
        """Download bytes from start to end inclusively."""
        with self._stay():
            # Locate which recorded intervals overlap [start, end].
            left = bisect_left(self._right, start)
            right = bisect_right(self._left, end)
            for start, end in self._merge(start, end, left, right):
                response = self._stream_response(start, end)
                response.raise_for_status()
                self.seek(start)
                for chunk in response_chunks(response, self._chunk_size):
                    self._file.write(chunk)
class TestWait(unittest.TestCase):
    """Tests for the ``wait`` helpers: log-file pattern waiting and TCP
    port open/closed probing."""

    def setUp(self):
        # Fresh scratch log file and fixture data for every test.
        self.file = NamedTemporaryFile()
        self.port = 9999
        self.patterns = ['foo', 'bar', 'f.*']

    def pattern(self, *args, **kwargs):
        # Convenience wrapper: wait for pattern(s) in our temp log file.
        return wait.log.pattern(self.file.name, *args, **kwargs)

    def write(self, s):
        # Append a line to the log and flush so a watcher sees it immediately.
        self.file.write(s.encode('utf-8'))
        self.file.write('\n'.encode('utf-8'))
        self.file.flush()

    def test_log_exists(self):
        assert wait.log.exists(self.file.name)

    def test_log_exists_timeout(self):
        assert not wait.log.exists('/tmp/nolog', timeout=1)

    def test_log_pattern_list(self):
        seek = self.file.tell()
        self.write(self.patterns[0])
        self.write(self.patterns[1])
        assert self.pattern(self.patterns, seek=seek, timeout=5)

    def test_log_pattern_tuple(self):
        seek = self.file.tell()
        self.write(self.patterns[0])
        self.write(self.patterns[1])
        assert self.pattern(tuple(self.patterns), seek=seek, timeout=5)

    def test_log_pattern_string(self):
        seek = self.file.tell()
        self.write(self.patterns[0])
        assert self.pattern(self.patterns[0], seek=seek, timeout=5)

    def test_log_pattern_nostart(self):
        # run=False returns a deferred callable; write the patterns
        # before invoking it.
        p = self.pattern(self.patterns, run=False, timeout=5)
        self.write(self.patterns[0])
        self.write(self.patterns[1])
        assert p()

    def test_log_pattern_timeout(self):
        assert not wait.log.pattern('/tmp/nolog', self.patterns, timeout=1)
        assert not self.pattern(self.patterns, timeout=1)

    def test_tcp_closed(self):
        assert wait.tcp.closed(self.port, timeout=1)
        assert not wait.tcp.open(self.port, timeout=1)

    def test_tcp_open(self):
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.bind(('localhost', self.port))
        s.listen(0)
        assert not wait.tcp.closed(self.port, timeout=1)
        assert wait.tcp.open(self.port, timeout=1)
        # NOTE(review): depends on outbound network access to
        # www.google.com:80 -- flaky in sandboxed CI.
        assert wait.tcp.open(80, host='www.google.com', timeout=5)
        s.close()

    def test_tcp_socket_timeout(self):
        # 10.255.255.1 is a non-routable address, so connect attempts
        # hang until the timeout fires.
        assert wait.tcp.closed(self.port, host='10.255.255.1', timeout=1)
        assert not wait.tcp.open(self.port, host='10.255.255.1', timeout=1)

    def test_tcp_open_timeout(self):
        assert not wait.tcp.open(self.port, timeout=1)

    def tearDown(self):
        self.file.close()
def getFile(self, uri, target_dir='/tmp', file_object=None, cb=None,
            timeout=30, chunk_size=(4096 * 1024)):
    """
    Download the file at *uri* from the server.

    If file_object is defined: the file contents are written to it and the
    filename as specified by the server is returned (None is returned if no
    filename is detected).  The file_object is not closed.  file_object must
    be opened with the 'b' attribute.

    Otherwise a file is created in target_dir, and the full path is
    returned.  If the filename is not specified by the server, a random
    filename is chosen.  WARNING: there isn't checking done to make sure the
    target file does not already exist; there is a possibility it could
    clobber something that already exists.  We do make sure the filename
    fits a regex pattern that prevents it from escaping the target_dir.
    The "filename" as sent by the server is the "model" of the uri.

    Make sure target_dir exists before calling getFile.
    """
    uri_parser = URI('/')
    try:
        # TODO: There has to be a better way to validate this uri
        (_, filename, _, _, _) = uri_parser.split(uri)
    except ValueError as e:
        raise InvalidRequest(str(e))

    # Due to the return value we have to do our own request, this is pretty
    # much a straight GET
    url = '{0}{1}'.format(self.host, uri)
    req = request.Request(url)
    req.get_method = lambda: 'GET'
    try:
        resp = self.opener.open(req, timeout=timeout)
    except request.HTTPError as e:
        raise ResponseError('HTTPError "{0}"'.format(e))
    except request.URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise Timeout(
                'Request Timeout after {0} seconds'.format(timeout))
        raise ResponseError('URLError "{0}" for "{1}" via "{2}"'.format(
            e, url, self.proxy))

    http_code = resp.code
    if http_code != 200:
        logging.warning(
            'cinp: unexpected HTTP Code "{0}" for File Get'.format(
                http_code))
        raise ResponseError(
            'Unexpected HTTP Code "{0}" for File Get'.format(http_code))

    try:
        # NOTE(review): this is the raw header value (a string when
        # present, the int 0 when absent); cb() receives it as-is.
        size = resp.headers['Content-Length']
    except KeyError:
        size = 0

    # Pick the destination: caller-supplied object, a NamedTemporaryFile
    # when the server gave no filename, or a file inside target_dir.
    if file_object is not None:
        file_writer = file_object
    else:
        if filename is None:
            file_writer = NamedTemporaryFile(dir=target_dir, mode='wb')
            filename = file_writer.name
        else:
            filename = os.path.join(target_dir, filename)
            file_writer = open(filename, 'wb')

    # Stream the body in chunks, invoking the progress callback with the
    # number of bytes written so far.
    buff = resp.read(chunk_size)
    while buff:
        file_writer.write(buff)
        if cb:
            cb(file_writer.tell(), size)
        buff = resp.read(chunk_size)

    resp.close()
    if file_object is not None:
        # Caller owns the file object; leave it open.
        return filename
    else:
        file_writer.close()
        return filename
class File (object):
    """
    A file wrapper that smooths over some platform-specific operations.

    Wraps either a NamedTemporaryFile (when no name is given) or a
    regular on-disk file, and layers advisory locking (fcntl on POSIX,
    win32file on Windows) over writes.
    """

    def __init__(self, name=None, readonly=False, **kwargs):
        if name is None:
            # Anonymous scratch file; deleted automatically on close.
            self.file = NamedTemporaryFile(**kwargs)
        else:
            if exists(name):
                if readonly:
                    self.file = open(name, 'rb')
                else:
                    self.file = open(name, 'r+b')
            else:
                if readonly:
                    raise OSError('No "%s" found.' % name)
                self.file = open(name, 'w+b')
        if readonly:
            assert self.is_readonly()
        self.has_lock = False

    def get_name(self):
        # Path of the underlying file on disk.
        return self.file.name

    def is_temporary(self):
        return isinstance(self.file, _TemporaryFileWrapper)

    def is_readonly(self):
        return self.file.mode == 'rb'

    def seek(self, n, whence=0):
        self.file.seek(n, whence)
        if whence == 0:
            assert self.file.tell() == n

    def seek_end(self):
        self.file.seek(0, 2)

    def read(self, n=None):
        if n is None:
            return self.file.read()
        else:
            return self.file.read(n)

    def tell(self):
        return self.file.tell()

    def stat(self):
        return os.stat(self.get_name())

    def __len__(self):
        return self.stat().st_size

    def rename(self, name):
        """Rename the underlying file to *name*, replacing any existing
        file, and reopen (and re-lock) it under the new path."""
        old_name = self.get_name()
        if name == old_name:
            return
        assert not self.is_temporary()
        self.obtain_lock()
        self.close()
        if exists(name):
            os.unlink(name)
        os.rename(old_name, name)
        self.file = open(name, 'r+b')
        self.obtain_lock()

    def obtain_lock(self):
        """
        Make sure that we have an exclusive lock on self.file before
        doing a write.
        If the lock is not available, raise an exception.
        """
        assert not self.is_readonly()
        if not self.has_lock:
            if os.name == 'nt':
                try:
                    # Non-blocking exclusive lock on the top 64KB region.
                    win32file.LockFileEx(
                        win32file._get_osfhandle(self.file.fileno()),
                        (win32con.LOCKFILE_EXCLUSIVE_LOCK |
                         win32con.LOCKFILE_FAIL_IMMEDIATELY),
                        0, -65536, pywintypes.OVERLAPPED())
                except pywintypes.error:
                    raise IOError("Unable to obtain lock")
            else:
                fcntl.flock(self.file, fcntl.LOCK_EX | fcntl.LOCK_NB)
            self.has_lock = True

    def release_lock(self):
        """
        Make sure that we do not retain an exclusive lock on self.file.
        """
        if self.has_lock:
            if os.name == 'nt':
                win32file.UnlockFileEx(
                    win32file._get_osfhandle(self.file.fileno()),
                    0, -65536, pywintypes.OVERLAPPED())
            else:
                fcntl.flock(self.file, fcntl.LOCK_UN)
            self.has_lock = False

    def write(self, s):
        self.obtain_lock()
        self.file.write(s)
        # Flush so the underlying file reflects the write immediately.
        self.file.flush()

    def truncate(self):
        self.obtain_lock()
        self.file.truncate()

    def close(self):
        self.release_lock()
        self.file.close()

    def flush(self):
        self.file.flush()

    def fsync(self):
        # os.fsync may be missing on some platforms.
        if hasattr(os, 'fsync'):
            os.fsync(self.file)
def _write_local_data_files(self, cursor):
    """
    Takes a cursor, and writes results to a local file (CSV or
    newline-delimited JSON, per ``self.export_format['file_format']``).

    :return: A dictionary where keys are filenames to be used as object
        names in GCS, and values are file handles to local files that
        contain the data for the GCS objects.
    """
    schema = list(
        map(lambda schema_tuple: schema_tuple[0], cursor.description))
    file_no = 0
    tmp_file_handle = NamedTemporaryFile(delete=True)
    tmp_file_handles = {self.filename.format(file_no): tmp_file_handle}

    # Save file header for csv if required
    if (self.export_format['file_format'] == 'csv'):
        # Deal with CSV formatting. Try to use dialect if passed
        if ('csv_dialect' in self.export_format):
            # Use dialect name from params
            dialect_name = self.export_format['csv_dialect']
        else:
            # Create internal dialect based on parameters passed
            dialect_name = 'mysql_to_gcs'
            # NOTE(review): eval() on operator-supplied config strings
            # (csv_quoting / csv_columnheader below) executes arbitrary
            # code if the config is attacker-controlled; consider a
            # whitelist lookup (e.g. getattr(csv, ...)) instead.
            csv.register_dialect(
                dialect_name,
                delimiter=self.export_format.get('csv_delimiter') or ',',
                doublequote=self.export_format.get('csv_doublequote')
                or 'True',
                escapechar=self.export_format.get('csv_escapechar') or None,
                lineterminator=self.export_format.get('csv_lineterminator')
                or '\r\n',
                quotechar=self.export_format.get('csv_quotechar') or '"',
                quoting=eval(
                    self.export_format.get('csv_quoting')
                    or 'csv.QUOTE_MINIMAL'))

        # Create CSV writer using either provided or generated dialect.
        # NOTE(review): the `encoding` kwarg is not accepted by the
        # stdlib csv module -- presumably `csv` here is unicodecsv;
        # confirm against the file's imports.
        csv_writer = csv.writer(tmp_file_handle,
                                encoding='utf-8',
                                dialect=dialect_name)

        # Include column header in first row
        if ('csv_columnheader' in self.export_format
                and eval(self.export_format['csv_columnheader'])):
            csv_writer.writerow(schema)

    for row in cursor:
        # Convert datetimes and longs to BigQuery safe types
        row = map(self.convert_types, row)

        # Save rows as CSV
        if (self.export_format['file_format'] == 'csv'):
            csv_writer.writerow(row)
        # Save rows as JSON
        else:
            # Convert datetime objects to utc seconds, and decimals to floats
            row_dict = dict(zip(schema, row))

            # TODO validate that row isn't > 2MB. BQ enforces a hard row size of 2MB.
            s = json.dumps(row_dict, sort_keys=True)
            if PY3:
                s = s.encode('utf-8')
            tmp_file_handle.write(s)

            # Append newline to make dumps BigQuery compatible.
            tmp_file_handle.write(b'\n')

        # Stop if the file exceeds the file size limit.
        if tmp_file_handle.tell() >= self.approx_max_file_size_bytes:
            file_no += 1
            tmp_file_handle = NamedTemporaryFile(delete=True)
            tmp_file_handles[self.filename.format(
                file_no)] = tmp_file_handle

            # For CSV files, we need to create a new writer with the new
            # handle and write header in first row
            if (self.export_format['file_format'] == 'csv'):
                csv_writer = csv.writer(tmp_file_handle,
                                        encoding='utf-8',
                                        dialect=dialect_name)
                if ('csv_columnheader' in self.export_format
                        and eval(self.export_format['csv_columnheader'])):
                    csv_writer.writerow(schema)

    return tmp_file_handles
class Pyforce(object):
    """Iterator over the marshalled result objects of a ``p4 -G`` call,
    with o4-specific reclassification of benign p4 "errors"."""

    def __init__(self, *args):
        """
        Create an iterator over results of a p4 call. The args here are
        p4 CLI arguments. See p4 help for more information.
        """
        self.args = [str(arg) for arg in args]
        from subprocess import Popen, PIPE
        from tempfile import NamedTemporaryFile
        # p4's stderr is captured to a temp file and inspected after the
        # stdout stream is exhausted.
        self.stderr = NamedTemporaryFile()
        if os.environ.get('DEBUG', ''):
            print(f'## p4', *self.args, file=sys.stderr)
        try:
            timeout = abs(int(os.environ['O4_P4_TIMEOUT']))
        except:
            timeout = 120
        # -G makes p4 emit marshalled Python dicts on stdout.
        self.pope = Popen(['p4', f'-vnet.maxwait={timeout}', '-G'] +
                          self.args,
                          stdout=PIPE,
                          stderr=self.stderr)
        self.transform = Pyforce.to_str
        self.errors = []

    def __iter__(self):
        return self

    def __next__(self):
        """
        Returns the next p4 result object from the command. If the p4
        command experiences a timeout, raise P4TimeoutError. All other
        errors are accumulated during the run and raised as arguments on a
        single P4Error object after the p4 process has been exhausted.

        Certain errors are not really errors, it's just p4 being silly.
        Such as the error "No files to reconcile" when you reconcile files
        that have the correct content. Such errors are converted to
        code=stat and passed on.

        Some may also produce a '#o4pass'-prefixed line out stdout, which,
        in a complete run, will make their way to "o4 fail" and be
        reported.
        """
        import marshal
        try:
            while True:
                # Each marshalled object is a dict with bytes keys/values.
                res = marshal.load(self.pope.stdout)
                if res.get(b'code') == b'info' and res.get(b'data', ''):
                    data = res.get(b'data')
                    ## Why was this upped to error?
                    # b"is opened and not being changed" in data or b"must resolve" in data) and
                    if data.startswith(b'Diff chunks') and not data.endswith(
                            b'+ 0 conflicting'):
                        print("*** WARNING: There are conflicts.",
                              file=sys.stderr)
                    elif (b"can't move (already opened for edit)" in data
                          or b"is opened for add and can't be replaced" in
                          data or b"- resolve skipped" in data):
                        # Downgrade to a plain stat and surface a warning
                        # line for "o4 fail" to pick up.
                        res[b'code'] = b'stat'
                        print(
                            f'#o4pass-warn#{data.decode("utf-8",errors="ignore")}'
                        )
                if res.get(b'code') != b'error':
                    return self.transform(res)
                if b'data' in res:
                    # Reclassify known-benign p4 "errors".
                    if (b'file(s) up-to-date' in res[b'data']
                            or b'no file(s) to reconcile' in res[b'data']
                            or b'no file(s) to resolve' in res[b'data']
                            or b'no file(s) to unshelve' in res[b'data']
                            or b'file(s) not on client' in res[b'data']
                            or b'No shelved files in changelist to delete' in
                            res[b'data']):
                        res[b'code'] = b'stat'
                    elif (b'no file(s) at that changelist number' in
                          res[b'data'] or
                          b'no revision(s) above those at that changelist number'
                          in res[b'data']):
                        # print('*** INFO: Skipping premature sync: ', res)
                        res[b'code'] = b'skip'
                    elif b'clobber writable file' in res[b'data']:
                        res[b'code'] = b'error'
                    # {b'code': b'error', b'data': b'SSL receive failed.\nread: Connection timed out: Connection timed out\n', b'severity': 3, b'generic': 38}
                    # 'data': 'TCP receive exceeded maximum configured duration of 60 seconds.\n', 'severity': 3, 'generic': 38
                    # This seems like it could be 100 different messages; we probably need
                    #TODO find out what generic means.
                    elif b'Connection timed out' in res[
                            b'data'] or b'TCP receive exceeded' in res[
                                b'data']:
                        raise P4TimeoutError(res, self.args)
                if res[b'code'] != b'error':
                    return self.transform(res)
                # Allow operation to complete and report errors after
                self.errors.append(Pyforce.to_str(res))
        except EOFError:
            # stdout exhausted; fall through to stderr / error reporting.
            pass
        if self.stderr.tell():
            self.stderr.seek(0)
            err = self.stderr.read().decode(sys.stdout.encoding)
            if 'timed out' in err:
                raise P4TimeoutError(err)
            self.errors.append({
                'code': 'error',
                'data': f'stderr: {err}',
                'severity': 3,
                'generic': 38
            })
        if self.errors:
            raise P4Error(*self.errors)
        raise StopIteration()

    def __del__(self):
        # Best-effort cleanup of the p4 subprocess.
        if hasattr(self, 'pope'):
            try:
                self.pope.kill()
                self.pope.wait()
            except OSError:
                pass

    @staticmethod
    def to_str(r):
        """
        Converts a dictionary of bytes key-values to strings using stdout
        encoding.
        """

        def dec(a):
            if hasattr(a, 'decode'):
                return a.decode(sys.stdout.encoding, errors='ignore')
            return a

        return {dec(k): dec(v) for k, v in r.items()}

    @staticmethod
    def unescape(path):
        """Reverts p4 path escaping."""
        return path.replace('%40', '@').replace('%23', '#').replace(
            '%2a', '*').replace('%25', '%')

    @staticmethod
    def escape(path):
        """Escapes a path like perforce would."""
        # %-escape must run first so it doesn't re-escape the others.
        return path.replace('%', '%25').replace('#', '%23').replace(
            '*', '%2a').replace('@', '%40')

    @staticmethod
    def checksum(fname, fileSize):
        """
        Probably the only complete resource to how perforce computes a
        checksum. Fundamentally it's a MD5 checksum of the file's content.
        However utf16 files must first be converted to utf8, and if the
        file system file size is 3 bytes larger than the stated file size,
        then if those three bytes are the utf8 BOM, they must not be
        included in the checksum. Hence the fileSize argument can be an
        integer, or in the case of utf8 files <int>/utf8, and in the utf16
        case <int>/utf16.
        """
        import hashlib
        hash_md5 = hashlib.md5()
        headType = ''
        if type(fileSize) != int:
            # fileSize arrived as "<bytes>/<headType>".
            if '/' in fileSize:
                fileSize, headType = fileSize.split('/', 1)
            fileSize = int(fileSize)
        try:
            with open(fname, 'rb') as f:
                if headType == 'utf16':
                    # FIXME: Don't overflow and die if there is a giant utf16 file
                    u = f.read().decode('utf16')
                    hash_md5.update(u.encode('utf8'))
                else:
                    if headType == 'utf8':
                        fs = os.fstat(f.fileno())
                        if fs.st_size > fileSize:
                            # Skip utf8 BOM when computing digest, if
                            # filesize differs from st_size
                            bom = f.read(3)
                            if bom != b'\xef\xbb\xbf':
                                f.seek(0)
                    for chunk in iter(lambda: f.read(1024 * 1024), b''):
                        hash_md5.update(chunk)
            return hash_md5.hexdigest().upper()
        except FileNotFoundError:
            return None
class AttachHTTP(AttachBase):
    """
    A wrapper for HTTP based attachment sources
    """

    # The default descriptive name associated with the service
    service_name = _('Web Based')

    # The default protocol
    protocol = 'http'

    # The default secure protocol
    secure_protocol = 'https'

    # The number of bytes in memory to read from the remote source at a time
    chunk_size = 8192

    # Web based requests are remote/external to our current location
    location = ContentLocation.HOSTED

    def __init__(self, headers=None, **kwargs):
        """
        Initialize HTTP Object

        headers can be a dictionary of key/value pairs that you want to
        additionally include as part of the server headers to post with
        """
        super(AttachHTTP, self).__init__(**kwargs)

        self.schema = 'https' if self.secure else 'http'

        self.fullpath = kwargs.get('fullpath')
        if not isinstance(self.fullpath, six.string_types):
            self.fullpath = '/'

        self.headers = {}
        if headers:
            # Store our extra headers
            self.headers.update(headers)

        # Where our content is written to upon a call to download.
        self._temp_file = None

        # Our Query String Dictionary; we use this to track arguments
        # specified that aren't otherwise part of this class
        self.qsd = {
            k: v
            for k, v in kwargs.get('qsd', {}).items()
            if k not in self.template_args
        }

        return

    def download(self, **kwargs):
        """
        Perform retrieval of the configuration based on the specified
        request
        """
        if self.location == ContentLocation.INACCESSIBLE:
            # our content is inaccessible
            return False

        # Ensure any existing content set has been invalidated
        self.invalidate()

        # prepare header
        headers = {
            'User-Agent': self.app_id,
        }

        # Apply any/all header over-rides defined
        headers.update(self.headers)

        auth = None
        if self.user:
            auth = (self.user, self.password)

        url = '%s://%s' % (self.schema, self.host)
        if isinstance(self.port, int):
            url += ':%d' % self.port

        url += self.fullpath

        self.logger.debug('HTTP POST URL: %s (cert_verify=%r)' % (
            url,
            self.verify_certificate,
        ))

        # Where our request object will temporarily live.
        r = None

        # Always call throttle before any remote server i/o is made
        self.throttle()

        try:
            # Make our request
            with requests.get(url,
                              headers=headers,
                              auth=auth,
                              params=self.qsd,
                              verify=self.verify_certificate,
                              timeout=self.request_timeout,
                              stream=True) as r:

                # Handle Errors
                r.raise_for_status()

                # Get our file-size (if known)
                try:
                    file_size = int(r.headers.get('Content-Length', '0'))
                except (TypeError, ValueError):
                    # Handle edge case where Content-Length is a bad value
                    file_size = 0

                # Perform a little Q/A on file limitations and restrictions
                if self.max_file_size > 0 and file_size > self.max_file_size:
                    # The content retrieved is too large
                    self.logger.error(
                        'HTTP response exceeds allowable maximum file length '
                        '({}KB): {}'.format(int(self.max_file_size / 1024),
                                            self.url(privacy=True)))

                    # Return False (signifying a failure)
                    return False

                # Detect config format based on mime if the format isn't
                # already enforced
                self.detected_mimetype = r.headers.get('Content-Type')

                d = r.headers.get('Content-Disposition', '')
                result = re.search("filename=['\"]?(?P<name>[^'\"]+)['\"]?",
                                   d, re.I)
                if result:
                    self.detected_name = result.group('name').strip()

                # Create a temporary file to work with
                self._temp_file = NamedTemporaryFile()

                # Get our chunk size
                chunk_size = self.chunk_size

                # Track all bytes written to disk
                bytes_written = 0

                # If we get here, we can now safely write our content to disk
                for chunk in r.iter_content(chunk_size=chunk_size):
                    # filter out keep-alive chunks
                    if chunk:
                        self._temp_file.write(chunk)
                        bytes_written = self._temp_file.tell()

                        # Prevent a case where Content-Length isn't provided
                        # we don't want to fetch beyond our limits
                        if self.max_file_size > 0:
                            if bytes_written > self.max_file_size:
                                # The content retrieved is too large
                                self.logger.error(
                                    'HTTP response exceeds allowable maximum '
                                    'file length ({}KB): {}'.format(
                                        int(self.max_file_size / 1024),
                                        self.url(privacy=True)))

                                # Invalidate any variables previously set
                                self.invalidate()

                                # Return False (signifying a failure)
                                return False

                            elif bytes_written + chunk_size \
                                    > self.max_file_size:
                                # Adjust our next read to accommodate up to
                                # our limit +1. This will prevent us from
                                # reading too much into our memory buffer.
                                # NOTE(review): this expression is a no-op --
                                # the result is computed and discarded, and
                                # iter_content's chunk size was fixed when
                                # the generator was created, so the intended
                                # clamping never takes effect. The size cap
                                # is still enforced by the check above.
                                self.max_file_size - bytes_written + 1

                # Ensure our content is flushed to disk for post-processing
                self._temp_file.flush()

            # Set our minimum requirements for a successful download() call
            self.download_path = self._temp_file.name
            if not self.detected_name:
                self.detected_name = os.path.basename(self.fullpath)

        except requests.RequestException as e:
            self.logger.error('A Connection error occurred retrieving HTTP '
                              'configuration from %s.' % self.host)
            self.logger.debug('Socket Exception: %s' % str(e))

            # Invalidate any variables previously set
            self.invalidate()

            # Return False (signifying a failure)
            return False

        except (IOError, OSError):
            # IOError is present for backwards compatibility with Python
            # versions older than 3.3.  >= 3.3 throw OSError now.

            # Could not open and/or write the temporary file
            self.logger.error('Could not write attachment to disk: {}'.format(
                self.url(privacy=True)))

            # Invalidate any variables previously set
            self.invalidate()

            # Return False (signifying a failure)
            return False

        # Return our success
        return True

    def invalidate(self):
        """
        Close our temporary file
        """
        if self._temp_file:
            self._temp_file.close()
            self._temp_file = None

        super(AttachHTTP, self).invalidate()

    def url(self, privacy=False, *args, **kwargs):
        """
        Returns the URL built dynamically based on specified arguments.
        """
        # Our URL parameters
        params = self.url_parameters(privacy=privacy, *args, **kwargs)

        # Prepare our cache value
        if self.cache is not None:
            if isinstance(self.cache, bool) or not self.cache:
                cache = 'yes' if self.cache else 'no'
            else:
                cache = int(self.cache)

            # Set our cache value
            params['cache'] = cache

        if self._mimetype:
            # A format was enforced
            params['mime'] = self._mimetype

        if self._name:
            # A name was enforced
            params['name'] = self._name

        # Append our headers into our parameters
        params.update({'+{}'.format(k): v for k, v in self.headers.items()})

        # Apply any remaining entries to our URL
        params.update(self.qsd)

        # Determine Authentication
        auth = ''
        if self.user and self.password:
            auth = '{user}:{password}@'.format(
                user=self.quote(self.user, safe=''),
                password=self.pprint(self.password,
                                     privacy,
                                     mode=PrivacyMode.Secret,
                                     safe=''),
            )
        elif self.user:
            auth = '{user}@'.format(user=self.quote(self.user, safe=''), )

        default_port = 443 if self.secure else 80

        return '{schema}://{auth}{hostname}{port}{fullpath}?{params}'.format(
            schema=self.secure_protocol if self.secure else self.protocol,
            auth=auth,
            hostname=self.quote(self.host, safe=''),
            port='' if self.port is None or self.port == default_port else
            ':{}'.format(self.port),
            fullpath=self.quote(self.fullpath, safe='/'),
            params=self.urlencode(params),
        )

    @staticmethod
    def parse_url(url):
        """
        Parses the URL and returns enough arguments that can allow
        us to re-instantiate this object.
        """
        results = AttachBase.parse_url(url)

        if not results:
            # We're done early as we couldn't load the results
            return results

        # Add our headers that the user can potentially over-ride if they
        # wish to to our returned result set
        results['headers'] = results['qsd-']
        results['headers'].update(results['qsd+'])

        return results
def _write_local_data_files(self, cursor): """ Takes a cursor, and writes results to a local file. :return: A dictionary where keys are filenames to be used as object names in GCS, and values are file handles to local files that contain the data for the GCS objects. """ org_schema = list( map(lambda schema_tuple: schema_tuple[0], cursor.description)) schema = [ column for column in org_schema if column not in self.exclude_columns ] col_type_dict = self._get_col_type_dict() file_no = 0 tmp_file_handle = NamedTemporaryFile(delete=True) if self.export_format == 'csv': file_mime_type = 'text/csv' elif self.export_format == 'parquet': file_mime_type = 'application/octet-stream' else: file_mime_type = 'application/json' file_to_upload = { 'file_name': self.filename.format(file_no), 'file_handle': tmp_file_handle, 'file_mime_type': file_mime_type, 'file_row_count': 0, } if self.export_format == 'csv': csv_writer = self._configure_csv_file(tmp_file_handle, schema) if self.export_format == 'parquet': parquet_schema = self._convert_parquet_schema(cursor) parquet_writer = self._configure_parquet_file( tmp_file_handle, parquet_schema) for row in cursor: file_to_upload['file_row_count'] += 1 if self.export_format == 'csv': row = self.convert_types(schema, col_type_dict, row) if self.null_marker is not None: row = [ value if value is not None else self.null_marker for value in row ] csv_writer.writerow(row) elif self.export_format == 'parquet': row = self.convert_types(schema, col_type_dict, row) if self.null_marker is not None: row = [ value if value is not None else self.null_marker for value in row ] row_pydic = {col: [value] for col, value in zip(schema, row)} tbl = pa.Table.from_pydict(row_pydic, parquet_schema) parquet_writer.write_table(tbl) else: row = self.convert_types(schema, col_type_dict, row, stringify_dict=False) row_dict = dict(zip(schema, row)) tmp_file_handle.write( json.dumps(row_dict, sort_keys=True, ensure_ascii=False).encode("utf-8")) # Append newline to 
make dumps BigQuery compatible. tmp_file_handle.write(b'\n') # Stop if the file exceeds the file size limit. if tmp_file_handle.tell() >= self.approx_max_file_size_bytes: file_no += 1 if self.export_format == 'parquet': parquet_writer.close() yield file_to_upload tmp_file_handle = NamedTemporaryFile(delete=True) file_to_upload = { 'file_name': self.filename.format(file_no), 'file_handle': tmp_file_handle, 'file_mime_type': file_mime_type, 'file_row_count': 0, } if self.export_format == 'csv': csv_writer = self._configure_csv_file( tmp_file_handle, schema) if self.export_format == 'parquet': parquet_writer = self._configure_parquet_file( tmp_file_handle, parquet_schema) if self.export_format == 'parquet': parquet_writer.close() # Last file may have 0 rows, don't yield if empty if file_to_upload['file_row_count'] > 0: yield file_to_upload
def test_binary_guesser():
    """Test bio_utils' binary_guesser with binary and text data

    binary_guesser() must accept the binary sample without raising, must
    raise FormatError for the base64 text sample, and must leave each
    handle's position where it found it.
    """
    # Store data
    binary_data = b'\x8e\xd2\x837U\xbc\\!H\xc8\xb1O\xac\x9e\xbf\xd4b\x82\xc9' \
                  b'\xd7\xaa\xb9\x16Uo5m\r\x00\x1e\xdd\x978\x00Rj\xe2Ng\xc3' \
                  b'=\xe6N}\x92\xf0(+\xa3\x99\\w\xe0\xa6\xb4\xa4\xc2\x90\x81' \
                  b'\xc4@\x10\x0f_\xdf\xdeo\r\xdc\xcd<\x7fq\x87\xb4\n\xcd' \
                  b'\xd2\r=\xfb\x84\xfb\xa5\xc0\x9e\xb4wl6j\xa9\xae\xe5\xc1' \
                  b'\xfb^\\L\xc8\x0b\xd1fU\xd1\xdd]\x06\x19\xf7\xc6\x90?x' \
                  b'\x06\x8ab\x0b\x14\xa4\x00z\x83\xe8\x90\x16@U\xba~\xbb' \
                  b'\xcf\x90\xb2\xdb>^A\xd1\xd45\xd7\xbc\x99\xf26\xf4\xa0' \
                  b'\x8f-\x04)\xf9[\x7f\xca\x81\xcd\x04\xefd\x9ci\xe8lH' \
                  b'\xce\xb8\xe6R\xe4#\xb5\x16\x97a\xd2\xda2\x1d\x9d\xb1#1 ' \
                  b'\xe1u\x04g2\xe4\xf0B\xa6\xcd\x00q\x9d=N\x1f\xf1%\xa6' \
                  b'\x89\xc2\xb4j\xeb\x90\x07>kJ\xefi\xd2tp\xb0\xf1\xb7' \
                  b'\xbb\xc8\xa8cZ\x0c\x88\xe2\x08\x0b\x05\xddS\x86\xa4s' \
                  b'\x1ck\x90\xa3\t(\x03n\xe8S\x8a\x03\xe3*\xb4\x02\x06%' \
                  b'\xfe2.?&\x13\x94\xea7\xd1\xb9\xef\xe1\x94Y\xbd58\xf4Y' \
                  b'\x13\xe9r\x90\x84\x0e{\xe2\x98\x12\xff\xf4f\x87J\xfc:' \
                  b'\xd7\xd9\xc6\xbf\xd3IU\xf5\\\xa1\xb0\xad\x04#\x9c\x0c' \
                  b'\x1d\x90\xbb\x93\xee\xbb\r\xa7\x96\t\x8b\xc1\x91\xecl' \
                  b'\xe1\x0f~3@\xa7\x98\re\x9b\x8fy\xb8U\x18\x04z\xe8\rT?' \
                  b'\xed\xb0\n\xf7*\xc8\xce\xb5N8\xaeh\x06\x84\'\xdd6SI' \
                  b'\xd6\xf9\xbdz\xd3\xab\xe3\xd9\xb3*BBd\xc0\x9d\xd6\x8a' \
                  b'\xb1\xe8\xc4\xb9\xacw|>\x80y\x86\xfcM!\x1b\xc9\xff\x93' \
                  b'\x8d\xb5\x89IL\x93J\x88\x0b\xe5\'\xbd\x13\xa9\xd5\xa0' \
                  b'\xe9Rs\xce,\x8e%\xdbQ\x85##I\x93\x04\xec\x98V\x8d\x9b' \
                  b'\xd9B9?z\'>Aq\x10`&\x0e\xa1\xb2\x94\x0c}"QI\x82\xf5.O' \
                  b'\x9a:uu|\xdd\x86^\xfd\x0bu\xbf05\xea\\e\xc7\\\xbe\xd9' \
                  b'\x98\x0fFo9\xb1\n`\xe9\x8ccg\n\x13\xcb\x1b!\xb2\xcdt|' \
                  b'\xc7!\xfawn3\xf0p\xb1n\xb6^\xe1;S\xa0\xf3y.\x8e\x83{' \
                  b'\x9f\x03\xa1\xfe\x8b\xae\xd4\xfa\xafh\xefP\x8c\xa0\xc1' \
                  b'\x8dWW\x85\xa0\xfeT\xa8\xa3\xe1\x85\x11G\x0f5\x83\xec' \
                  b'\xebvJ\x1a(\xbdk\x8c\xbbf\x81\x1d\xc0\x91[\x1c\x9d\xa4' \
                  b'\x0c\x81\xfe\x94-\xd9\xa0\xd3\x0c\xe0~\r\x8eZ\xc91>\xac' \
                  b'\x935\x94H\xfeN\x02\t\xe5\xb15X3\xcb3n\xec\x82\xbcl\x05' \
                  b'\xa7\x07X\xc6\x1a`\x1b\xd3\x85\x0c<c\x81K$\xb9#\x12h' \
                  b'\xa9gN\xce\x8f:\x0e\xe1r\xf2K\xc1\x05\xa5J6\x12\xf8\xd7' \
                  b'\xce\xcb@\xea\xb3\x0c]\x89\xe3\x9b)\xcd\x11\x06\x9bH4\n' \
                  b'\xad\xbd\xdb\x80U\r\x9e\xf6h$;Gov\xb3\x03\x88a\x81.MA' \
                  b'\x99\xc2\xc2Q\x1c=3c#)\xfb\xc1\x10f<xI\xef\xb2\xdcP' \
                  b'\xd9P\x1d\xc68\xec#-\xbd\xf2\x8c\x16a\xaa\x1a\xb6qb\x15' \
                  b'\xa8\xcct\xb8e\xc9\xbb\xd6S\x01 U\xcfw\xbd\xc0\xab\xb3l' \
                  b'\x1d\xd2\xa6k\x04\x06G_\x0e\x9bjam\xb4\xc4-\xcf\xad\x07c' \
                  b'\xf9"N\x8c\xe3r.\x0cq\xe2\x8c\x99\xd5\xa9\xfc\xbevRW7' \
                  b'\x17y\xfd\xbf\x9bq\t\x92\x1d\xc9\x19E\xd5\xedJ\xea9\xa4' \
                  b'\xd26~\xcc\x12\x9b\x12\xc4\x96(\xbe\xd7\x05-\xc9\x9f\x02' \
                  b'\xe2\x08f\xaf\'J\x0c\xb1\xcd\xa6\x80k)s\xa8\xbe\x15\x9d' \
                  b'\r}P2\xa1u\r~T\xedq\xa1X3o\x0b\xcb\x9dN\x8dAME\xe9\xcb\n' \
                  b'\xc6 ,\n\xa3\xba\x9a\x15\xc5-\xbaW\x89y?\xe3\x16 T!\xf0' \
                  b'\xf5\xfd\xa3Ks3\xb7\xe9F#\xdd\xebQ\xa9+#\xf9WG\x05\x93' \
                  b'\x93\x9a\x127\xf7d\xf2\x1cx\x9a2\x0fB\xber*\xc4\x90\xf8' \
                  b'\x07\xd7#\xf4\xff\xc0\xdcF\xd7<d\xb0\xdb\xcf\xa1\x1e' \
                  b'\xd2\x98\xde\xd1=u\xa6\xc4\x81\xf0\x04#x\xb6\xde\x0e\xbe' \
                  b'\xc6\x1b:\x10\x8f\xdf\xa3\x99E\xa2\xc2W\xde\xa7\x03\xe6x' \
                  b'\xc3\x07\x9d\xf1\x01$\x1d\xa1L\xad\xe8bnI\x14\xe7\xc1,'

    text_data = 'BGwrYz3oUOoys8NJQN0Ju43r28l/bdXne8YbOZWiPMMoZFyxp9Qmc4NK6k' \
                'Bs/DA2ZougW3RVZGAs\n3RRPLU78oRpTH3jzSViqj0jEtpMIwpOofhDjyP' \
                '8bM7/bHWIa9XruomgdnOxkttqMc/Mxj6ZcODlv\nGADtY86z+/VdfO9lDj' \
                'nwYmkkvjPN3qxpy6LIx9ZPMKpwCzTheidJR95u6gG+1ofA5HYaLIReujUn' \
                '\ntvtZKu49pmiEuz5tT0VWRPHR/7q2Eg5u7SZAhlWtOW+G/P7QkLFButy8' \
                'sArJwCBtEl6DH7B+L570\nZxfBaF1yaFU7VmZNL3e6MIq2Lgkk6TU3Ezvy' \
                'LMB1ZLt8Zpst4tL814fMmJ6QazUaafG73YQkmoVg\nGdbemZBu3CLxJ3iX' \
                'i9NPZxDionF9yNAt7gdiGqrVC3lRJIgSF1wn5/jqsdv8OhBI98DWOOYGmv' \
                'EJ\nM+DztfOx4KQpA4TSunCRK/2H6POolGN1gOXbteUZY4cA2FreVW15QG' \
                '/an30epRiKH/cgeNdEuIIe\niFsWt62tFTxXaQZZbc/p/hwUJ7iSMeYpq7' \
                'WgYmJQmkdHggKFFZniuI5VyE1YHqVu1bZEhLaI3XSJ\npGF9dvGRCamzGO' \
                'xLnz7TsjbVM45maSPXGJVw5OgZrZhqPdZNKgplblL8xvg//lRF582cYQFy' \
                'yM8X\nOGqN83/QKo02FwEdqGg6DD5zzbLys4K/HjYguARUHLMBziFCvq2x' \
                '9z31pSJUUCaBVit0Z4S4cCiK\narptw/91PnBJCdchBk0T62Kt4E41ClWV' \
                'OUWZcLKWVhW689HLrvO4YCBi+qZDtTJFK1cmahAh9xZj\n1KmfvZzM6QFB' \
                'RTtH2qzvEsgiA6lu9u1HS8ohHFxEYDJ32XKoNSQtarfOpjw/sA3kUaBi5a' \
                '1Josah\nXDyGoXSXdtVq2wdZLLf7uuwbTUZae6j+bl5R7dYTkKzhsaVmpU' \
                'zkrCHjl7XB+9YfpNwiCYPIfZSQ\nNluAEf2OeGozMipZ47fh9PMvWHri3g' \
                '8pA/7B9Nn8K3mSmEDLBBZgkcKynR6rtSgzj2hIX0qS0/iX\nihk5ZjvZiu' \
                'tqPiix6j+SSl59jk2WERh1IVHHWtBJUknbTlV3reTL+aWZHfkUioA0RSRi' \
                'cwBTY6ou\nnypnq8l4mPTWUCZReDz7N5OEGWquroD8Fv4+IB5EviVI6Xrj' \
                'Yil8m0rIjtbmwgFK0kSvkTEUI0DD\nCH3TY/+tXgLWA6scXG46T9+deuM0' \
                'F7H/+4iRfnLV1LMV8J+roIFcg3VPX1yBW4wryXNdERVNhbTk\nI/9c17pC' \
                '8fWqhv8kLBvcZcbzn6XDkKWXcQ6VOwiopYw/b6HaPDR7zSeBhNoPPJEw5q' \
                'q6ZSs2eA==\n'

    # Binary sample: must be accepted silently and the position rewound.
    binary_handle = NamedTemporaryFile(mode='wb+')
    binary_handle.write(binary_data)
    binary_handle.seek(0)
    binary_guesser(binary_handle)
    assert binary_handle.tell() == 0

    # Text sample: must raise FormatError and leave the position untouched.
    text_handle = NamedTemporaryFile(mode='wt+')
    text_handle.write(text_data)
    text_handle.seek(256)
    try:
        binary_guesser(text_handle, num_bytes=128)
    except FormatError as error:
        assert error.message == '{0} is probably not a binary ' \
                                'file'.format(text_handle.name)
    else:
        # Bug fix: previously, if binary_guesser failed to raise at all,
        # this negative test passed silently. Fail explicitly instead.
        raise AssertionError('binary_guesser() did not raise FormatError '
                             'for text data')
    assert text_handle.tell() == 256
def commit_file(srcfile, user_id, extension):
    # Compress a finished scratch file and hand the result off for commit.
    #
    # NOTE(review): the original indentation was lost; this reconstruction
    # keeps the whole body under USE_COMPRESSION, which matches the token
    # order.  Confirm against upstream whether an uncompressed commit path
    # exists elsewhere (with USE_COMPRESSION off, this function would do
    # nothing as reconstructed).
    if USE_COMPRESSION:
        extension += '.gz'
        # We need to copy the file contents to the original location;
        # delete=False keeps the temp file on disk after close so that
        # commit_file_compressed() can still reach it by name.
        compfile = NamedTemporaryFile(prefix='twitter-',
                                      dir=TEMPORARY_DIRECTORY,
                                      delete=False)
        # NOTE(review): the '%s' in this label is never filled in --
        # presumably profiled() formats it; verify.
        with profiled("Compressing output in %s"):
            with compressor(compfile) as gzfile:
                srcfile.seek(0)
                shutil.copyfileobj(srcfile, gzfile)
                # gzfile.tell() is the uncompressed byte count; compfile's
                # position is the compressed bytes written so far.
                log.msg("Output file size is %d bytes (%d bytes compressed)"
                        % (gzfile.tell(), compfile.tell()))
            # Delete the old plain file -- its contents now live in compfile.
            srcfile.close()
        compfile.close()
        commit_file_compressed(compfile, user_id, extension)
class CallbackFileWrapper(object):
    """
    Wraps a file-like object and tees every byte read from it into a
    temporary spool file.  Once the wrapped stream is exhausted and
    closed, the supplied callback is invoked with the complete contents.

    Attribute access for anything this class does not define is forwarded
    to the underlying file object.

    Double-underscore member names are used deliberately: name mangling
    keeps the wrapper from accidentally shadowing an attribute of the
    object it proxies.

    Spooling through a temporary file means large bodies are unloaded to
    disk when memory pressure is high (provided the temp directory is
    disk-backed rather than a memory tmpfs); small bodies typically never
    leave the filesystem memory cache, so the overhead is negligible.
    """

    def __init__(self, fp, callback):
        self.__buf = NamedTemporaryFile("rb+", delete=True)
        self.__fp = fp
        self.__callback = callback

    def __getattr__(self, name):
        # During garbage collection self.__fp may not be set yet, so fetch
        # it via __getattribute__ using the mangled name [0]; a missing
        # attribute then raises AttributeError directly instead of
        # recursing back into __getattr__ forever.
        #
        # [0] https://docs.python.org/2/reference/expressions.html#atom-identifiers
        inner = self.__getattribute__("_CallbackFileWrapper__fp")
        return getattr(inner, name)

    def __is_fp_closed(self):
        # urllib3-style responses expose .fp, which becomes None at EOF.
        try:
            return self.__fp.fp is None
        except AttributeError:
            pass

        # Plain file-like objects expose .closed instead.
        try:
            return self.__fp.closed
        except AttributeError:
            pass

        # Can't tell -- assume still open, which means we simply won't
        # cache this body.
        # TODO: Add some logging here...
        return False

    def _close(self):
        if self.__callback:
            if self.__buf.tell():
                # Hand the callback a zero-copy view of the spooled data:
                # mmap() maps straight into the filesystem's memory cache,
                # so the body is not duplicated in memory.
                self.__buf.seek(0, 0)
                result = memoryview(
                    mmap.mmap(self.__buf.fileno(), 0,
                              access=mmap.ACCESS_READ))
            else:
                # Nothing was buffered.
                result = b""
            self.__callback(result)

        # Drop the callback reference.  Holding on to it can deadlock the
        # CPython interpreter when the callback keeps alive an object with
        # a __del__ method; clearing it breaks the cycle so the garbage
        # collector can proceed normally.
        self.__callback = None

        # Release the spool file promptly -- it frees memory and disk
        # space, which matters when caching big files.
        self.__buf.close()

    def read(self, amt=None):
        chunk = self.__fp.read(amt)
        if chunk:
            # b'' signals "we're done" (and may arrive after self.__buf has
            # already been closed), so only tee non-empty chunks.
            self.__buf.write(chunk)
        if self.__is_fp_closed():
            self._close()

        return chunk

    def _safe_read(self, amt):
        chunk = self.__fp._safe_read(amt)
        if amt == 2 and chunk == b"\r\n":
            # urllib issues exactly this read to discard the CRLF at the
            # end of a chunk; keep it out of the buffered body.
            return chunk

        self.__buf.write(chunk)
        if self.__is_fp_closed():
            self._close()

        return chunk
class FileStorage(Storage):
    """
    Append-only object store (Python 2 era Durus file storage).

    Instance attributes:
      fp : file
      index : { oid:string : offset:int }
        Gives the offset of the current version of each oid.
      pending_records : { oid:str : record:str }
        Object records are accumulated here during a commit.
      pack_extra : [oid:str] | None
        oids of objects that have been committed after the pack began.
        It is None if a pack is not in progress.
    """
    # Number of records to pack before yielding control back to the caller.
    _PACK_INCREMENT = 20

    def __init__(self, filename=None, readonly=False, repair=False):
        """(filename:str=None, readonly:bool=False, repair:bool=False)
        If filename is empty (or None), a temporary file will be used.
        """
        self.oid = 0
        self.filename = filename
        if readonly:
            # A readonly storage never creates files, so a name is required.
            if not filename:
                raise ValueError(
                    "A filename is required for a readonly storage.")
            if repair:
                raise ValueError("A readonly storage can't be repaired.")
            self.fp = open(self.filename, 'rb')
        else:
            if not filename:
                # Anonymous storage backed by a temporary file.
                self.fp = NamedTemporaryFile(suffix=".durus", mode="w+b")
            elif os.path.exists(self.filename):
                # Existing database: open for append.
                self.fp = open(self.filename, 'a+b')
            else:
                self.fp = open(self.filename, 'w+b')
            try:
                # Guard against a second writer on the same file.
                lock_file(self.fp)
            except IOError:
                self.fp.close()
                raise RuntimeError(
                    "\n %s is locked."
                    "\n There is probably a Durus storage server (or a client)"
                    "\n using it.\n" % self.get_filename())
        self.pending_records = {}
        self.pack_extra = None
        self.repair = repair
        self._set_concrete_class_for_magic()
        self.index = {}
        self._build_index()
        # Resume oid allocation after the highest oid present in the file.
        max_oid = 0
        for oid in self.index:
            max_oid = max(max_oid, u64(oid))
        self.oid = max_oid

    def _set_concrete_class_for_magic(self):
        """
        FileStorage is an abstract class.  The constructor calls this to
        set self.__class__ to a subclass that matches the format of the
        underlying file.  If the underlying file is empty, this writes the
        magic string into the file.
        """
        # NOTE(review): the original indentation was lost; the nesting of
        # the header-writing tail under this `if` is reconstructed --
        # confirm against upstream Durus.
        if self.__class__ is FileStorage:
            for format in (FileStorage1, FileStorage2):
                # Try each known on-disk format; the last one assigned
                # (FileStorage2) remains in effect if none match.
                self.fp.seek(0)
                self.__class__ = format
                if format.MAGIC == self.fp.read(len(format.MAGIC)):
                    return
            # Write header for new FileStorage2 file.
            self.fp.seek(0, 2)
            if self.fp.tell() != 0:
                # Non-empty file without recognizable magic: refuse it.
                raise IOError, "%r has no FileStorage magic" % self.fp
            self._write_header(self.fp)
            self._write_index(self.fp, {})

    def _write_header(self, fp):
        # Only valid on a brand-new (empty) file.
        fp.seek(0, 2)
        assert fp.tell() == 0
        fp.write(self.MAGIC)

    def _write_index(self, fp, index):
        # No-op here; concrete format subclasses may override.
        pass

    def get_size(self):
        # Number of live objects, not bytes on disk.
        return len(self.index)

    def new_oid(self):
        self.oid += 1
        return p64(self.oid)

    def load(self, oid):
        if self.fp is None:
            raise IOError, 'storage is closed'
        offset = self.index[oid]
        self.fp.seek(offset)
        return self._read_block()

    def begin(self):
        pass

    def store(self, oid, record):
        """Add a record during a commit."""
        self.pending_records[oid] = record

    def _generate_pending_records(self):
        for oid, record in self.pending_records.iteritems():
            yield oid, record

    def end(self, handle_invalidations=None):
        """Complete a commit.
        """
        if self.fp is None:
            raise IOError, 'storage is closed'
        index = {}
        # Drain the incremental writer; no need for its pauses here.
        for z in self._write_transaction(
            self.fp, self._generate_pending_records(), index):
            pass
        self.fp.flush()
        fsync(self.fp)
        self.index.update(index)
        if self.pack_extra is not None:
            # A pack is in progress: remember these oids so the packer also
            # copies the freshly committed records.
            self.pack_extra.extend(index)
        self.pending_records.clear()

    def sync(self):
        """
        A FileStorage is the storage of one StorageServer or one
        Connection, so there can never be any invalidations to transfer.
        """
        return []

    def get_filename(self):
        """() -> str
        The name of the file.
        If a tempfile is being used, the name will change when it is
        packed.
        """
        return self.filename or self.fp.name

    def _write_transaction(self, fp, records, index):
        # Append records at the end of the file, noting each offset in
        # `index`; yields every _PACK_INCREMENT records so the caller can
        # interleave other work.
        fp.seek(0, 2)
        for i, (oid, record) in enumerate(records):
            full_record = self._disk_format(record)
            index[oid] = fp.tell()
            fp.write(p32(len(full_record)))
            fp.write(full_record)
            if i % self._PACK_INCREMENT == 0:
                yield None
        fp.write(p32(0)) # terminator

    def _disk_format(self, record):
        return record

    def _packer(self):
        # Generator that copies every reachable record into a fresh file
        # and then swaps it in as the new backing store.
        if self.filename:
            prepack_name = self.filename + '.prepack'
            pack_name = self.filename + '.pack'
            packed = open(pack_name, 'w+b')
        else:
            # Anonymous storage packs into another temporary file.
            packed = NamedTemporaryFile(suffix=".durus", mode="w+b")
        lock_file(packed)
        self._write_header(packed)
        def gen_reachable_records():
            # Graph walk from the root; unreachable records are dropped,
            # which is the whole point of packing.
            todo = [ROOT_OID]
            seen = Set()
            while todo:
                oid = todo.pop()
                if oid in seen:
                    continue
                seen.add(oid)
                record = self.load(oid)
                record_oid, data, refdata = unpack_record(record)
                assert oid == record_oid
                todo.extend(split_oids(refdata))
                yield oid, record
            # Also copy records committed while the pack was running.
            while self.pack_extra:
                oid = self.pack_extra.pop()
                yield oid, self.load(oid)
        index = {}
        for z in self._write_transaction(
            packed, gen_reachable_records(), index):
            yield None
        self._write_index(packed, index)
        packed.flush()
        fsync(packed)
        if self.filename:
            if not RENAME_OPEN_FILE:
                unlock_file(packed)
                packed.close()
            unlock_file(self.fp)
            self.fp.close()
            if os.path.exists(prepack_name):
                # for Win32: rename fails when the target already exists.
                os.unlink(prepack_name)
            os.rename(self.filename, prepack_name)
            os.rename(pack_name, self.filename)
            if RENAME_OPEN_FILE:
                self.fp = packed
            else:
                self.fp = open(self.filename, 'r+b')
                lock_file(self.fp)
        else:
            # tempfile: simply adopt the packed file as our backing store.
            unlock_file(self.fp)
            self.fp.close()
            self.fp = packed
        self.index = index
        self.pack_extra = None

    def get_packer(self):
        """Return an incremental packer (a generator).  Each time next()
        is called, up to _PACK_INCREMENT records will be packed.  Note
        that the generator must be exhausted before calling get_packer()
        again.
        """
        if self.fp is None:
            raise IOError, 'storage is closed'
        if self.fp.mode == 'rb':
            raise IOError, "read-only storage"
        assert not self.pending_records
        assert self.pack_extra is None
        self.pack_extra = []
        return self._packer()

    def pack(self):
        # Convenience wrapper: run the incremental packer to completion.
        for z in self.get_packer():
            pass

    def gen_oid_record(self):
        """() -> sequence([(oid:str, record:str)])
        Generate oid, record pairs, for all oids in the database.
        Note that this may include oids that are not reachable from
        the root object.
        """
        for oid in self.index:
            yield oid, self.load(oid)

    def close(self):
        if self.fp is not None:
            unlock_file(self.fp)
            self.fp.close()
            self.fp = None

    def _read_block(self):
        # Read one length-prefixed record at the current file position.
        size_str = self.fp.read(4)
        if len(size_str) == 0:
            raise IOError, "eof"
        size = u32(size_str)
        if size == 0:
            # A zero length marks the transaction terminator.
            return ''
        result = self.fp.read(size)
        if len(result) != size:
            raise IOError, "short read"
        return result