def stream_to_open_named_file(stream, fd, filename, source_encoding=None, source_error='strict', target_encoding=None, target_error='strict'):
    """Write ``stream`` to the already-open descriptor ``fd``; return
    ``(filename, is_multi_byte)``.  Closes the file descriptor.

    Text content is transcoded from ``source_encoding`` to
    ``target_encoding``; compressed or binary content is written through
    unchanged.  Unknown/missing encodings fall back to
    ``util.DEFAULT_ENCODING``.
    """
    # Signature and behavior are somewhat odd, due to backwards
    # compatibility, but this can/should be done better.
    CHUNK_SIZE = 1048576  # 1 MiB per read
    data_checked = False
    is_compressed = False
    is_binary = False
    is_multi_byte = False
    if not target_encoding or not encodings_search_function(target_encoding):
        target_encoding = util.DEFAULT_ENCODING  # utf-8
    if not source_encoding:
        source_encoding = util.DEFAULT_ENCODING  # sys.getdefaultencoding() would mimic old behavior (defaults to ascii)
    while True:
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:
            break
        if not data_checked:
            # Classify the upload once, from the file name / first chunk.
            if zipfile.is_zipfile(filename):
                is_compressed = True
            else:
                try:
                    # Best-effort gzip magic-number check.
                    if unicode(chunk[:2]) == unicode(util.gzip_magic):
                        is_compressed = True
                except Exception:
                    # Undecodable leading bytes simply mean "not gzip";
                    # was a bare except, which also swallowed SystemExit
                    # and KeyboardInterrupt.
                    pass
            if not is_compressed:
                # See if we have a multi-byte character file.
                chars = chunk[:100]
                is_multi_byte = util.is_multi_byte(chars)
                if not is_multi_byte:
                    is_binary = util.is_binary(chunk)
            data_checked = True
        if not is_compressed and not is_binary:
            if not isinstance(chunk, unicode):
                chunk = chunk.decode(source_encoding, source_error)
            os.write(fd, chunk.encode(target_encoding, target_error))
        else:
            # Compressed files must be encoded after they are uncompressed
            # in the upload utility, while binary files should not be
            # encoded at all.
            os.write(fd, chunk)
    os.close(fd)
    return filename, is_multi_byte
def stream_to_open_named_file(
    stream, fd, filename, source_encoding=None, source_error="strict", target_encoding=None, target_error="strict"
):
    """Write ``stream`` to the already-open descriptor ``fd``; return
    ``(filename, is_multi_byte)``.  Closes the file descriptor.

    Text is re-encoded from ``source_encoding`` to ``target_encoding``;
    compressed and binary payloads are copied through verbatim.
    """
    # signature and behavior is somewhat odd, due to backwards
    # compatibility, but this can/should be done better
    CHUNK_SIZE = 1048576  # read 1 MiB at a time
    data_checked = False
    is_compressed = False
    is_binary = False
    is_multi_byte = False
    # Fall back to the project default when the encoding is absent/unknown.
    if not target_encoding or not encodings_search_function(target_encoding):
        target_encoding = util.DEFAULT_ENCODING  # utf-8
    if not source_encoding:
        source_encoding = util.DEFAULT_ENCODING  # sys.getdefaultencoding() would mimic old behavior (defaults to ascii)
    while True:
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:
            break
        if not data_checked:
            # See if we're uploading a compressed file (checked once).
            if zipfile.is_zipfile(filename):
                is_compressed = True
            else:
                try:
                    if unicode(chunk[:2]) == unicode(util.gzip_magic):
                        is_compressed = True
                except Exception:
                    # Narrowed from a bare except: non-decodable bytes just
                    # mean the content is not gzip-prefixed text.
                    pass
            if not is_compressed:
                # See if we have a multi-byte character file
                chars = chunk[:100]
                is_multi_byte = util.is_multi_byte(chars)
                if not is_multi_byte:
                    is_binary = util.is_binary(chunk)
            data_checked = True
        if not is_compressed and not is_binary:
            if not isinstance(chunk, unicode):
                chunk = chunk.decode(source_encoding, source_error)
            os.write(fd, chunk.encode(target_encoding, target_error))
        else:
            # Compressed files must be encoded after they are uncompressed in
            # the upload utility, while binary files should not be encoded at
            # all.
            os.write(fd, chunk)
    os.close(fd)
    return filename, is_multi_byte
def stream_to_file(stream, suffix='', prefix='', dir=None, text=False):
    """Write ``stream`` to a fresh temporary file; return
    ``(temp_name, is_multi_byte)``.

    The ``suffix``/``prefix``/``dir``/``text`` arguments are passed straight
    to ``tempfile.mkstemp``.  Plain text is re-encoded as UTF-8; compressed
    or binary content is written through unchanged.
    """
    fd, temp_name = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=dir, text=text)
    CHUNK_SIZE = 1048576  # 1 MiB per read
    data_checked = False
    is_compressed = False
    is_binary = False
    is_multi_byte = False
    while True:
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:
            break
        if not data_checked:
            # See if we're uploading a compressed file (first chunk only).
            if zipfile.is_zipfile(temp_name):
                is_compressed = True
            else:
                try:
                    # Best-effort gzip magic-number comparison.
                    if unicode(chunk[:2]) == unicode(util.gzip_magic):
                        is_compressed = True
                except Exception:
                    # Narrowed from bare except: undecodable leading bytes
                    # simply mean "not gzip".
                    pass
            if not is_compressed:
                # See if we have a multi-byte character file
                chars = chunk[:100]
                is_multi_byte = util.is_multi_byte(chars)
                if not is_multi_byte:
                    # Any high-bit byte in the sample marks it binary.
                    is_binary = any(ord(char) > 128 for char in chars)
            data_checked = True
        if not is_compressed and not is_binary:
            os.write(fd, chunk.encode("utf-8"))
        else:
            # Compressed files must be encoded after they are uncompressed
            # in the upload utility, while binary files should not be
            # encoded at all.
            os.write(fd, chunk)
    os.close(fd)
    return temp_name, is_multi_byte
def stream_to_file( stream, suffix='', prefix='', dir=None, text=False ):
    """Write ``stream`` to a temporary file created with
    ``tempfile.mkstemp``; return ``(temp_name, is_multi_byte)``.

    Text content is written UTF-8 encoded; compressed and binary content is
    copied through verbatim.
    """
    fd, temp_name = tempfile.mkstemp( suffix=suffix, prefix=prefix, dir=dir, text=text )
    CHUNK_SIZE = 1048576  # read 1 MiB at a time
    data_checked = False
    is_compressed = False
    is_binary = False
    is_multi_byte = False
    while True:
        chunk = stream.read( CHUNK_SIZE )
        if not chunk:
            break
        if not data_checked:
            # See if we're uploading a compressed file
            if zipfile.is_zipfile( temp_name ):
                is_compressed = True
            else:
                try:
                    if unicode( chunk[:2] ) == unicode( util.gzip_magic ):
                        is_compressed = True
                except Exception:
                    # Narrowed from a bare except, which also caught
                    # SystemExit/KeyboardInterrupt; decode failure just
                    # means the content is not gzip-prefixed.
                    pass
            if not is_compressed:
                # See if we have a multi-byte character file
                chars = chunk[:100]
                is_multi_byte = util.is_multi_byte( chars )
                if not is_multi_byte:
                    for char in chars:
                        if ord( char ) > 128:
                            is_binary = True
                            break
            data_checked = True
        if not is_compressed and not is_binary:
            os.write( fd, chunk.encode( "utf-8" ) )
        else:
            # Compressed files must be encoded after they are uncompressed
            # in the upload utility, while binary files should not be
            # encoded at all.
            os.write( fd, chunk )
    os.close( fd )
    return temp_name, is_multi_byte
def stream_to_open_named_file( stream, fd, filename ):
    """Write ``stream`` to the provided open file descriptor ``fd``; return
    ``(filename, is_multi_byte)``.  Closes the file descriptor.

    Plain text is written UTF-8 encoded; compressed or binary content is
    copied through unchanged.
    """
    # signature and behavior is somewhat odd, due to backwards
    # compatibility, but this can/should be done better
    CHUNK_SIZE = 1048576  # 1 MiB per read
    data_checked = False
    is_compressed = False
    is_binary = False
    is_multi_byte = False
    while True:
        chunk = stream.read( CHUNK_SIZE )
        if not chunk:
            break
        if not data_checked:
            # See if we're uploading a compressed file
            if zipfile.is_zipfile( filename ):
                is_compressed = True
            else:
                try:
                    if unicode( chunk[:2] ) == unicode( util.gzip_magic ):
                        is_compressed = True
                except Exception:
                    # Narrowed from a bare except: a failed comparison or
                    # decode only means the file is not gzip-prefixed.
                    pass
            if not is_compressed:
                # See if we have a multi-byte character file
                chars = chunk[:100]
                is_multi_byte = util.is_multi_byte( chars )
                if not is_multi_byte:
                    for char in chars:
                        if ord( char ) > 128:
                            is_binary = True
                            break
            data_checked = True
        if not is_compressed and not is_binary:
            os.write( fd, chunk.encode( "utf-8" ) )
        else:
            # Compressed files must be encoded after they are uncompressed
            # in the upload utility, while binary files should not be
            # encoded at all.
            os.write( fd, chunk )
    os.close( fd )
    return filename, is_multi_byte
def check_binary( self, temp_name, chunk=None ):
    """Heuristically decide whether content is binary.

    Scans up to ~10 lines, returning True as soon as a character with
    ordinal > 128 is found in content that is not multi-byte text.
    Returns False for multi-byte (``util.is_multi_byte``) content or when
    nothing suspicious is seen.  Opens ``temp_name`` only when ``chunk``
    is None; the opened file is now always closed (the original leaked it
    on the multi-byte early return).
    """
    if chunk is None:
        temp = open( temp_name, "U" )  # universal-newline text mode
    else:
        temp = chunk
    try:
        lineno = 0
        for line in temp:
            lineno += 1
            line = line.strip()
            if line:
                # Multi-byte character text is not considered binary.
                if util.is_multi_byte( line ):
                    return False
                for char in line:
                    if ord( char ) > 128:
                        return True
            # Only sample the first ~10 lines.
            if lineno > 10:
                break
        return False
    finally:
        # Close only the handle we opened ourselves; a caller-supplied
        # chunk is the caller's responsibility.
        if chunk is None:
            temp.close()
file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)), dataset, json_file) return dataset.path = temp_name # See if we have an empty file if not os.path.exists(dataset.path): file_err('Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file) return if not os.path.getsize(dataset.path) > 0: file_err('The uploaded file is empty', dataset, json_file) return if not dataset.type == 'url': # Already set is_multi_byte above if type == 'url' try: dataset.is_multi_byte = util.is_multi_byte( codecs.open(dataset.path, 'r', 'utf-8').read(100)) except UnicodeDecodeError, e: dataset.is_multi_byte = False # Is dataset an image? image = check_image(dataset.path) if image: if not PIL: image = None # get_image_ext() returns None if nor a supported Image type ext = get_image_ext(dataset.path, image) data_type = ext # Is dataset content multi-byte? elif dataset.is_multi_byte: data_type = 'multi-byte char' ext = sniff.guess_ext(dataset.path, is_multi_byte=True) # Is dataset content supported sniffable binary?
temp_name, dataset.is_multi_byte = sniff.stream_to_file( page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers( page.headers ) ) except Exception, e: file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file ) return dataset.path = temp_name # See if we have an empty file if not os.path.exists( dataset.path ): file_err( 'Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file ) return if not os.path.getsize( dataset.path ) > 0: file_err( 'The uploaded file is empty', dataset, json_file ) return if not dataset.type == 'url': # Already set is_multi_byte above if type == 'url' try: dataset.is_multi_byte = util.is_multi_byte( codecs.open( dataset.path, 'r', 'utf-8' ).read( 100 ) ) except UnicodeDecodeError, e: dataset.is_multi_byte = False # Is dataset an image? image = check_image( dataset.path ) if image: if not PIL: image = None # get_image_ext() returns None if nor a supported Image type ext = get_image_ext( dataset.path, image ) data_type = ext # Is dataset content multi-byte? elif dataset.is_multi_byte: data_type = 'multi-byte char' ext = sniff.guess_ext( dataset.path, is_multi_byte=True ) # Is dataset content supported sniffable binary?
temp_name, is_multi_byte = sniff.stream_to_file( urllib.urlopen( dataset.path ), prefix='url_paste' ) except Exception, e: file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file ) return dataset.path = temp_name dataset.is_multi_byte = is_multi_byte # See if we have an empty file if not os.path.exists( dataset.path ): file_err( 'Uploaded temporary file (%s) does not exist. Please' % dataset.path, dataset, json_file ) return if not os.path.getsize( dataset.path ) > 0: file_err( 'The uploaded file is empty', dataset, json_file ) return if 'is_multi_byte' not in dir( dataset ): dataset.is_multi_byte = util.is_multi_byte( open( dataset.path, 'r' ).read( 1024 ) ) if dataset.is_multi_byte: ext = sniff.guess_ext( dataset.path, is_multi_byte=True ) data_type = ext else: # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress is_gzipped, is_valid = check_gzip( dataset.path ) if is_gzipped and not is_valid: file_err( 'The uploaded file contains inappropriate content', dataset, json_file ) return elif is_gzipped and is_valid: # We need to uncompress the temp_name file CHUNK_SIZE = 2**20 # 1Mb fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( dataset.path ) ) gzipped_file = gzip.GzipFile( dataset.path ) while 1:
) except Exception, e: file_err("Unable to fetch %s\n%s" % (dataset.path, str(e)), dataset, json_file) return dataset.path = temp_name # See if we have an empty file if not os.path.exists(dataset.path): file_err("Uploaded temporary file (%s) does not exist." % dataset.path, dataset, json_file) return if not os.path.getsize(dataset.path) > 0: file_err("The uploaded file is empty", dataset, json_file) return if not dataset.type == "url": # Already set is_multi_byte above if type == 'url' try: dataset.is_multi_byte = util.is_multi_byte(codecs.open(dataset.path, "r", "utf-8").read(100)) except UnicodeDecodeError, e: dataset.is_multi_byte = False # Is dataset an image? image = check_image(dataset.path) if image: if not PIL: image = None # get_image_ext() returns None if nor a supported Image type ext = get_image_ext(dataset.path, image) data_type = ext # Is dataset content multi-byte? elif dataset.is_multi_byte: data_type = "multi-byte char" ext = sniff.guess_ext(dataset.path, is_multi_byte=True) # Is dataset content supported sniffable binary?