Esempio n. 1
0
def stream_to_open_named_file(stream,
                              fd,
                              filename,
                              source_encoding=None,
                              source_error='strict',
                              target_encoding=None,
                              target_error='strict'):
    """
    Write ``stream`` to the already-open file descriptor ``fd`` and return
    ``(filename, is_multi_byte)``.  Closes ``fd`` when done.

    Text content is transcoded from ``source_encoding`` to ``target_encoding``;
    compressed and binary content is written through unchanged.
    """
    #signature and behavior is somewhat odd, due to backwards compatibility, but this can/should be done better
    CHUNK_SIZE = 1048576  # 1 MiB per read
    data_checked = False
    is_compressed = False
    is_binary = False
    is_multi_byte = False
    # Fall back to the default encoding when none is given or the target is unknown
    if not target_encoding or not encodings_search_function(target_encoding):
        target_encoding = util.DEFAULT_ENCODING  #utf-8
    if not source_encoding:
        source_encoding = util.DEFAULT_ENCODING  #sys.getdefaultencoding() would mimic old behavior (defaults to ascii)
    while True:
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:
            break
        if not data_checked:
            # Inspect only the first chunk.  See if we're uploading a compressed file.
            if zipfile.is_zipfile(filename):
                is_compressed = True
            else:
                try:
                    if unicode(chunk[:2]) == unicode(util.gzip_magic):
                        is_compressed = True
                except Exception:
                    # The magic-number comparison can raise (e.g. UnicodeDecodeError
                    # on binary data); treat that as "not gzip".  Narrowed from a
                    # bare except so SystemExit/KeyboardInterrupt still propagate.
                    pass
            if not is_compressed:
                # See if we have a multi-byte character file
                chars = chunk[:100]
                is_multi_byte = util.is_multi_byte(chars)
                if not is_multi_byte:
                    is_binary = util.is_binary(chunk)
            data_checked = True
        if not is_compressed and not is_binary:
            # Text data: decode then re-encode so the file ends up in target_encoding
            if not isinstance(chunk, unicode):
                chunk = chunk.decode(source_encoding, source_error)
            os.write(fd, chunk.encode(target_encoding, target_error))
        else:
            # Compressed files must be encoded after they are uncompressed in the upload utility,
            # while binary files should not be encoded at all.
            os.write(fd, chunk)
    os.close(fd)
    return filename, is_multi_byte
Esempio n. 2
0
def stream_to_open_named_file(
    stream, fd, filename, source_encoding=None, source_error="strict", target_encoding=None, target_error="strict"
):
    """
    Write ``stream`` to the already-open file descriptor ``fd`` and return
    ``(filename, is_multi_byte)``.  Closes ``fd`` when done.

    Text content is transcoded from ``source_encoding`` to ``target_encoding``;
    compressed and binary content is written through unchanged.
    """
    # signature and behavior is somewhat odd, due to backwards compatibility, but this can/should be done better
    CHUNK_SIZE = 1048576  # 1 MiB per read
    data_checked = False
    is_compressed = False
    is_binary = False
    is_multi_byte = False
    # Fall back to the default encoding when none is given or the target is unknown
    if not target_encoding or not encodings_search_function(target_encoding):
        target_encoding = util.DEFAULT_ENCODING  # utf-8
    if not source_encoding:
        source_encoding = util.DEFAULT_ENCODING  # sys.getdefaultencoding() would mimic old behavior (defaults to ascii)
    while True:
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:
            break
        if not data_checked:
            # Inspect only the first chunk.  See if we're uploading a compressed file.
            if zipfile.is_zipfile(filename):
                is_compressed = True
            else:
                try:
                    if unicode(chunk[:2]) == unicode(util.gzip_magic):
                        is_compressed = True
                except Exception:
                    # Magic-number comparison may raise (e.g. UnicodeDecodeError on
                    # binary data); treat that as "not gzip".  Narrowed from a bare
                    # except so SystemExit/KeyboardInterrupt still propagate.
                    pass
            if not is_compressed:
                # See if we have a multi-byte character file
                chars = chunk[:100]
                is_multi_byte = util.is_multi_byte(chars)
                if not is_multi_byte:
                    is_binary = util.is_binary(chunk)
            data_checked = True
        if not is_compressed and not is_binary:
            # Text data: decode then re-encode so the file ends up in target_encoding
            if not isinstance(chunk, unicode):
                chunk = chunk.decode(source_encoding, source_error)
            os.write(fd, chunk.encode(target_encoding, target_error))
        else:
            # Compressed files must be encoded after they are uncompressed in the upload utility,
            # while binary files should not be encoded at all.
            os.write(fd, chunk)
    os.close(fd)
    return filename, is_multi_byte
Esempio n. 3
0
def stream_to_file(stream, suffix='', prefix='', dir=None, text=False):
    """
    Write ``stream`` to a fresh temporary file and return
    ``(temp_name, is_multi_byte)``.

    Extra keyword arguments are passed through to ``tempfile.mkstemp``.
    Text content is re-encoded as UTF-8; compressed/binary content is
    written through unchanged.
    """
    fd, temp_name = tempfile.mkstemp(suffix=suffix,
                                     prefix=prefix,
                                     dir=dir,
                                     text=text)
    CHUNK_SIZE = 1048576  # 1 MiB per read
    data_checked = False
    is_compressed = False
    is_binary = False
    is_multi_byte = False
    while True:
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:
            break
        if not data_checked:
            # Inspect only the first chunk.  See if we're uploading a compressed file.
            if zipfile.is_zipfile(temp_name):
                is_compressed = True
            else:
                try:
                    if unicode(chunk[:2]) == unicode(util.gzip_magic):
                        is_compressed = True
                except Exception:
                    # Magic-number comparison may raise on binary data; treat as
                    # "not gzip".  Narrowed from a bare except so
                    # SystemExit/KeyboardInterrupt still propagate.
                    pass
            if not is_compressed:
                # See if we have a multi-byte character file
                chars = chunk[:100]
                is_multi_byte = util.is_multi_byte(chars)
                if not is_multi_byte:
                    # Any byte value above 128 in the sample marks the file as binary
                    is_binary = any(ord(char) > 128 for char in chars)
            data_checked = True
        if not is_compressed and not is_binary:
            os.write(fd, chunk.encode("utf-8"))
        else:
            # Compressed files must be encoded after they are uncompressed in the upload utility,
            # while binary files should not be encoded at all.
            os.write(fd, chunk)
    os.close(fd)
    return temp_name, is_multi_byte
Esempio n. 4
0
def stream_to_file( stream, suffix='', prefix='', dir=None, text=False ):
    """
    Write ``stream`` to a fresh temporary file and return
    ``( temp_name, is_multi_byte )``.

    Extra keyword arguments are passed through to ``tempfile.mkstemp``.
    Text content is re-encoded as UTF-8; compressed/binary content is
    written through unchanged.
    """
    fd, temp_name = tempfile.mkstemp( suffix=suffix, prefix=prefix, dir=dir, text=text )
    CHUNK_SIZE = 1048576  # 1 MiB per read
    data_checked = False
    is_compressed = False
    is_binary = False
    is_multi_byte = False
    while True:
        chunk = stream.read( CHUNK_SIZE )
        if not chunk:
            break
        if not data_checked:
            # Inspect only the first chunk.  See if we're uploading a compressed file.
            if zipfile.is_zipfile( temp_name ):
                is_compressed = True
            else:
                try:
                    if unicode( chunk[:2] ) == unicode( util.gzip_magic ):
                        is_compressed = True
                except Exception:
                    # Magic-number comparison may raise on binary data; treat as
                    # "not gzip".  Narrowed from a bare except so
                    # SystemExit/KeyboardInterrupt still propagate.
                    pass
            if not is_compressed:
                # See if we have a multi-byte character file
                chars = chunk[:100]
                is_multi_byte = util.is_multi_byte( chars )
                if not is_multi_byte:
                    # Any byte value above 128 in the sample marks the file as binary
                    is_binary = any( ord( char ) > 128 for char in chars )
            data_checked = True
        if not is_compressed and not is_binary:
            os.write( fd, chunk.encode( "utf-8" ) )
        else:
            # Compressed files must be encoded after they are uncompressed in the upload utility,
            # while binary files should not be encoded at all.
            os.write( fd, chunk )
    os.close( fd )
    return temp_name, is_multi_byte
Esempio n. 5
0
def stream_to_open_named_file( stream, fd, filename ):
    """
    Write ``stream`` to the already-open file descriptor ``fd`` and return
    ``( filename, is_multi_byte )``.  Closes ``fd`` when done.

    Text content is re-encoded as UTF-8; compressed/binary content is
    written through unchanged.
    """
    #signature and behavior is somewhat odd, due to backwards compatibility, but this can/should be done better
    CHUNK_SIZE = 1048576  # 1 MiB per read
    data_checked = False
    is_compressed = False
    is_binary = False
    is_multi_byte = False
    while True:
        chunk = stream.read( CHUNK_SIZE )
        if not chunk:
            break
        if not data_checked:
            # Inspect only the first chunk.  See if we're uploading a compressed file.
            if zipfile.is_zipfile( filename ):
                is_compressed = True
            else:
                try:
                    if unicode( chunk[:2] ) == unicode( util.gzip_magic ):
                        is_compressed = True
                except Exception:
                    # Magic-number comparison may raise on binary data; treat as
                    # "not gzip".  Narrowed from a bare except so
                    # SystemExit/KeyboardInterrupt still propagate.
                    pass
            if not is_compressed:
                # See if we have a multi-byte character file
                chars = chunk[:100]
                is_multi_byte = util.is_multi_byte( chars )
                if not is_multi_byte:
                    # Any byte value above 128 in the sample marks the file as binary
                    is_binary = any( ord( char ) > 128 for char in chars )
            data_checked = True
        if not is_compressed and not is_binary:
            os.write( fd, chunk.encode( "utf-8" ) )
        else:
            # Compressed files must be encoded after they are uncompressed in the upload utility,
            # while binary files should not be encoded at all.
            os.write( fd, chunk )
    os.close( fd )
    return filename, is_multi_byte
Esempio n. 6
0
 def check_binary( self, temp_name, chunk=None ):
     if chunk is None:
         temp = open( temp_name, "U" )
     else:
         temp = chunk
     lineno = 0
     for line in temp:
         lineno += 1
         line = line.strip()
         if line:
             if util.is_multi_byte( line ):
                 return False
             for char in line:
                 if ord( char ) > 128:
                     if chunk is None:
                         temp.close()
                     return True
         if lineno > 10:
             break
     if chunk is None:
         temp.close()
     return False
Esempio n. 7
0
         file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)),
                  dataset, json_file)
         return
     dataset.path = temp_name
 # See if we have an empty file
 if not os.path.exists(dataset.path):
     file_err('Uploaded temporary file (%s) does not exist.' % dataset.path,
              dataset, json_file)
     return
 if not os.path.getsize(dataset.path) > 0:
     file_err('The uploaded file is empty', dataset, json_file)
     return
 if not dataset.type == 'url':
     # Already set is_multi_byte above if type == 'url'
     try:
         dataset.is_multi_byte = util.is_multi_byte(
             codecs.open(dataset.path, 'r', 'utf-8').read(100))
     except UnicodeDecodeError, e:
         dataset.is_multi_byte = False
 # Is dataset an image?
 image = check_image(dataset.path)
 if image:
     if not PIL:
         image = None
     # get_image_ext() returns None if nor a supported Image type
     ext = get_image_ext(dataset.path, image)
     data_type = ext
 # Is dataset content multi-byte?
 elif dataset.is_multi_byte:
     data_type = 'multi-byte char'
     ext = sniff.guess_ext(dataset.path, is_multi_byte=True)
 # Is dataset content supported sniffable binary?
Esempio n. 8
0
         temp_name, dataset.is_multi_byte = sniff.stream_to_file( page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers( page.headers ) )
     except Exception, e:
         file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file )
         return
     dataset.path = temp_name
 # See if we have an empty file
 if not os.path.exists( dataset.path ):
     file_err( 'Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file )
     return
 if not os.path.getsize( dataset.path ) > 0:
     file_err( 'The uploaded file is empty', dataset, json_file )
     return
 if not dataset.type == 'url':
     # Already set is_multi_byte above if type == 'url'
     try:
         dataset.is_multi_byte = util.is_multi_byte( codecs.open( dataset.path, 'r', 'utf-8' ).read( 100 ) )
     except UnicodeDecodeError, e:
         dataset.is_multi_byte = False
 # Is dataset an image?
 image = check_image( dataset.path )
 if image:
     if not PIL:
         image = None
     # get_image_ext() returns None if nor a supported Image type
     ext = get_image_ext( dataset.path, image )
     data_type = ext
 # Is dataset content multi-byte?
 elif dataset.is_multi_byte:
     data_type = 'multi-byte char'
     ext = sniff.guess_ext( dataset.path, is_multi_byte=True )
 # Is dataset content supported sniffable binary?
Esempio n. 9
0
            temp_name, is_multi_byte = sniff.stream_to_file( urllib.urlopen( dataset.path ), prefix='url_paste' )
        except Exception, e:
            file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file )
            return
        dataset.path = temp_name
        dataset.is_multi_byte = is_multi_byte

    # See if we have an empty file
    if not os.path.exists( dataset.path ):
        file_err( 'Uploaded temporary file (%s) does not exist.  Please' % dataset.path, dataset, json_file )
        return
    if not os.path.getsize( dataset.path ) > 0:
        file_err( 'The uploaded file is empty', dataset, json_file )
        return
    if 'is_multi_byte' not in dir( dataset ):
        dataset.is_multi_byte = util.is_multi_byte( open( dataset.path, 'r' ).read( 1024 ) )
    if dataset.is_multi_byte:
        ext = sniff.guess_ext( dataset.path, is_multi_byte=True )
        data_type = ext
    else:
        # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
        is_gzipped, is_valid = check_gzip( dataset.path )
        if is_gzipped and not is_valid:
            file_err( 'The uploaded file contains inappropriate content', dataset, json_file )
            return
        elif is_gzipped and is_valid:
            # We need to uncompress the temp_name file
            CHUNK_SIZE = 2**20 # 1Mb   
            fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( dataset.path ) )
            gzipped_file = gzip.GzipFile( dataset.path )
            while 1:
Esempio n. 10
0
         )
     except Exception, e:
         file_err("Unable to fetch %s\n%s" % (dataset.path, str(e)), dataset, json_file)
         return
     dataset.path = temp_name
 # See if we have an empty file
 if not os.path.exists(dataset.path):
     file_err("Uploaded temporary file (%s) does not exist." % dataset.path, dataset, json_file)
     return
 if not os.path.getsize(dataset.path) > 0:
     file_err("The uploaded file is empty", dataset, json_file)
     return
 if not dataset.type == "url":
     # Already set is_multi_byte above if type == 'url'
     try:
         dataset.is_multi_byte = util.is_multi_byte(codecs.open(dataset.path, "r", "utf-8").read(100))
     except UnicodeDecodeError, e:
         dataset.is_multi_byte = False
 # Is dataset an image?
 image = check_image(dataset.path)
 if image:
     if not PIL:
         image = None
     # get_image_ext() returns None if nor a supported Image type
     ext = get_image_ext(dataset.path, image)
     data_type = ext
 # Is dataset content multi-byte?
 elif dataset.is_multi_byte:
     data_type = "multi-byte char"
     ext = sniff.guess_ext(dataset.path, is_multi_byte=True)
 # Is dataset content supported sniffable binary?