Esempio n. 1
0
 def __init__(self, name, fileobj, delimiter=None, quotechar=None,
              encoding='utf-8', window=None, doublequote=None,
              lineterminator=None, skipinitialspace=None):
     """Set up the row set and pre-read a sample of lines.

     ``fileobj`` is passed through ``messytables.seekable_stream`` and
     wrapped in ``UTF8Recoder`` together with ``encoding``; ``ilines``
     then supplies the line iterator kept in ``self.lines``.  Up to
     ``window`` lines (1000 when ``window`` is falsy) are consumed from
     that iterator into ``self._sample``.  The dialect options
     (``delimiter``, ``quotechar``, ``doublequote``, ``lineterminator``,
     ``skipinitialspace``) are stored exactly as given.
     """
     self.name = name

     # Build the reading pipeline: seekable stream -> recoder -> lines.
     self.fileobj = UTF8Recoder(messytables.seekable_stream(fileobj),
                                encoding)
     self.lines = ilines(self.fileobj)

     # Dialect hints are only recorded here, not interpreted.
     self.delimiter = delimiter
     self.quotechar = quotechar
     self.doublequote = doublequote
     self.lineterminator = lineterminator
     self.skipinitialspace = skipinitialspace
     self.window = window if window else 1000

     # Pull at most ``window`` lines; a shorter input simply ends early.
     self._sample = []
     for _ in xrange(self.window):
         try:
             line = self.lines.next()
         except StopIteration:
             break
         self._sample.append(line)

     super(CSVRowSet, self).__init__()
Esempio n. 2
0
 def __init__(self, fileobj, delimiter=None, quotechar=None, name=None,
              encoding=None, window=None):
     """Record the parsing options and a seekable view of ``fileobj``.

     ``fileobj`` is stored after being passed through
     ``messytables.seekable_stream``.  ``name`` falls back to ``'table'``
     when falsy; ``delimiter``, ``quotechar``, ``encoding`` and ``window``
     are stored exactly as given.
     """
     self.fileobj = messytables.seekable_stream(fileobj)
     self.name = name if name else 'table'

     # Parsing hints are kept untouched; None means "not specified".
     self.encoding = encoding
     self.window = window
     self.delimiter = delimiter
     self.quotechar = quotechar
Esempio n. 3
0
 def __init__(self, name, fileobj, delimiter=None, quotechar=None,
              encoding="utf-8", window=None, doublequote=None,
              lineterminator=None, skipinitialspace=None):
     """Initialise the row set and buffer an initial sample of lines.

     The stream returned by ``messytables.seekable_stream(fileobj)`` is
     wrapped in ``UTF8Recoder`` with ``encoding`` and handed to
     ``ilines``; the resulting iterator is kept in ``self.lines``.  Up
     to ``window`` lines (1000 when ``window`` is falsy) are read from
     it into ``self._sample``.  All dialect options are stored as given.
     """
     self.name = name

     # Dialect hints are only recorded here, not interpreted.
     self.delimiter = delimiter
     self.quotechar = quotechar
     self.doublequote = doublequote
     self.lineterminator = lineterminator
     self.skipinitialspace = skipinitialspace
     self.window = window if window else 1000

     # Reading pipeline: seekable stream -> recoder -> line iterator.
     recoded = UTF8Recoder(messytables.seekable_stream(fileobj), encoding)
     self.fileobj = recoded
     self.lines = ilines(recoded)

     # Collect the sample; stop quietly if the input runs out first.
     self._sample = []
     for _ in xrange(self.window):
         try:
             next_line = self.lines.next()
         except StopIteration:
             break
         self._sample.append(next_line)

     super(CSVRowSet, self).__init__()
Esempio n. 4
0
 def __init__(self, fileobj, delimiter=None, quotechar=None, name=None,
              encoding=None, window=None):
     """Record the parsing options and a seekable view of ``fileobj``.

     ``fileobj`` is stored after being passed through
     ``messytables.seekable_stream``.  Falsy values are replaced by
     defaults for ``name`` (``'table'``), ``delimiter`` (``','``) and
     ``quotechar`` (``'"'``); ``encoding`` and ``window`` are stored
     exactly as given.
     """
     self.fileobj = messytables.seekable_stream(fileobj)

     # Options with a fallback when the caller supplied nothing usable.
     self.name = name if name else 'table'
     self.delimiter = delimiter if delimiter else ','
     self.quotechar = quotechar if quotechar else '"'

     # Options stored verbatim.
     self.encoding = encoding
     self.window = window
Esempio n. 5
0
def any_tableset(fileobj, mimetype=None, extension=None):
    """Return the table set that best matches ``fileobj``'s format.

    The format is selected from ``mimetype`` (e.g. ``'text/csv'``) or
    ``extension`` (e.g. ``'tsv'``, compared case-insensitively).  Only
    when neither is given is the type auto-detected with the ``magic``
    library, which looks at the first 1024 bytes of the stream and is
    often wrong.  Consult the source for the recognized MIME types and
    file extensions.

    Returns an instance of the matching ``*TableSet`` class constructed
    from ``fileobj``.

    Raises ValueError when neither the MIME type nor the extension is
    recognized.
    """
    # Auto-detect only if the caller has offered no clue, because the
    # auto-detection routine is pretty poor.
    if mimetype is None and extension is None:
        import magic
        # Peeking at the start of the stream requires being able to seek
        # back afterwards, so go through seekable_stream first.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    # Normalise once; a missing or empty extension never matches below.
    ext = extension.lower() if extension else None

    # ZIP is checked first: for an archive the extension describes the
    # inner files, so it must not be matched before the ZIP MIME type.
    if (mimetype in ('application/x-zip-compressed', 'application/zip')
            or ext == 'zip'):
        return ZIPTableSet(fileobj)

    if mimetype in ('text/csv', 'text/comma-separated-values') or ext == 'csv':
        return CSVTableSet(fileobj)  # guess delimiter
    if mimetype in ('text/tsv', 'text/tab-separated-values') or ext == 'tsv':
        return CSVTableSet(fileobj, delimiter='\t')
    if (mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                     'application/xls')
            or ext == 'xls'):
        return XLSTableSet(fileobj)
    if (mimetype in (
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',)
            or ext == 'xlsx'):
        return XLSXTableSet(fileobj)
    if mimetype in ('text/html',) or ext in ('htm', 'html'):
        return HTMLTableSet(fileobj)

    if mimetype:
        raise ValueError("Unrecognized MIME type: {mimetype}".format(
            mimetype=mimetype))
    if extension:
        # Single-line message: the previous triple-quoted literal leaked
        # a newline and source indentation into the error text.
        raise ValueError(
            "Could not determine MIME type and "
            "unrecognized extension: {extension}".format(extension=extension))
    raise ValueError("Could not determine MIME type and no extension given.")
Esempio n. 6
0
def get_mime(fileobj):
    """Sniff the MIME type of ``fileobj`` from its first 4096 bytes.

    The stream obtained from ``messytables.seekable_stream`` is read,
    handed to ``magic.from_buffer`` and then rewound to offset 0.

    Returns the MIME type reported by ``magic``, except that a result of
    ``application/vnd.ms-excel`` for content starting with ``PK`` is
    reported as the XLSX (spreadsheetml) MIME type instead.
    """
    import magic
    # Peeking at the start of the stream requires being able to seek
    # back afterwards, so go through seekable_stream first.
    fileobj = messytables.seekable_stream(fileobj)
    header = fileobj.read(4096)
    mimetype = magic.from_buffer(header, mime=True)
    fileobj.seek(0)
    # There's an issue with vnd.ms-excel being returned from XLSX files,
    # too.  ZIP-based files start with "PK"; compare against a bytes
    # literal so the check still matches when ``header`` is bytes
    # (a str literal never equals bytes on Python 3).
    if mimetype == 'application/vnd.ms-excel' and header[:2] == b'PK':
        return 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    return mimetype
Esempio n. 7
0
 def __init__(self, fileobj, delimiter=None, quotechar=None, name=None,
              encoding=None, window=None, doublequote=None,
              lineterminator=None, skipinitialspace=None):
     """Record the parsing options and a seekable view of ``fileobj``.

     ``fileobj`` is stored after being passed through
     ``messytables.seekable_stream``.  ``name`` falls back to ``'table'``
     when falsy.  ``encoding``, ``window`` and the dialect options
     (``delimiter``, ``quotechar``, ``doublequote``, ``lineterminator``,
     ``skipinitialspace``) are stored exactly as given.
     """
     self.fileobj = messytables.seekable_stream(fileobj)
     self.name = name if name else 'table'

     # Reading options, stored verbatim.
     self.encoding = encoding
     self.window = window

     # Dialect hints, stored verbatim; None means "not specified".
     self.delimiter = delimiter
     self.quotechar = quotechar
     self.doublequote = doublequote
     self.lineterminator = lineterminator
     self.skipinitialspace = skipinitialspace
Esempio n. 8
0
def get_mime(fileobj):
    """Sniff the MIME type of ``fileobj`` from its first 4096 bytes.

    The stream obtained from ``messytables.seekable_stream`` is read,
    handed to ``magic.from_buffer`` and then rewound to offset 0.

    Returns the MIME type reported by ``magic``, except that a result of
    ``application/vnd.ms-excel`` for content starting with ``PK`` is
    reported as the XLSX (spreadsheetml) MIME type instead.
    """
    import magic
    # Peeking at the start of the stream requires being able to seek
    # back afterwards, so go through seekable_stream first.
    fileobj = messytables.seekable_stream(fileobj)
    header = fileobj.read(4096)
    mimetype = magic.from_buffer(header, mime=True)
    fileobj.seek(0)
    # There's an issue with vnd.ms-excel being returned from XLSX files,
    # too.  ZIP-based files start with "PK"; compare against a bytes
    # literal so the check still matches when ``header`` is bytes
    # (a str literal never equals bytes on Python 3).
    if mimetype == 'application/vnd.ms-excel' and header[:2] == b'PK':
        return 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    return mimetype
Esempio n. 9
0
 def __init__(self, fileobj, delimiter=None, quotechar=None, name=None,
              encoding=None, window=None, doublequote=None,
              lineterminator=None, skipinitialspace=None, **kw):
     """Record the parsing options and a seekable view of ``fileobj``.

     ``fileobj`` is stored after being passed through
     ``messytables.seekable_stream``.  ``name`` falls back to ``'table'``
     when falsy.  ``encoding``, ``window`` and the dialect options
     (``delimiter``, ``quotechar``, ``doublequote``, ``lineterminator``,
     ``skipinitialspace``) are stored exactly as given.  Any additional
     keyword arguments are accepted and not used here.
     """
     self.fileobj = messytables.seekable_stream(fileobj)
     self.name = name if name else 'table'

     # Reading options, stored verbatim.
     self.encoding = encoding
     self.window = window

     # Dialect hints, stored verbatim; None means "not specified".
     self.delimiter = delimiter
     self.quotechar = quotechar
     self.doublequote = doublequote
     self.lineterminator = lineterminator
     self.skipinitialspace = skipinitialspace
Esempio n. 10
0
def get_mime(fileobj):
    """Sniff the MIME type of ``fileobj`` from its first 4096 bytes.

    The stream obtained from ``messytables.seekable_stream`` is read,
    handed to ``magic.from_buffer`` and then rewound to offset 0.

    Returns the detected MIME type as text.  The XLSX (spreadsheetml)
    MIME type is returned instead when either
    * ``MIMELOOKUP`` maps the detected type to ``'ZIP'`` and the header
      contains ``[Content_Types].xml``, or
    * the detected type is ``application/vnd.ms-excel`` but the header
      starts with ``PK``.
    """
    import magic
    xlsx_mime = ('application/vnd.openxmlformats-officedocument'
                 '.spreadsheetml.sheet')
    # Peeking at the start of the stream requires being able to seek
    # back afterwards, so go through seekable_stream first.
    fileobj = messytables.seekable_stream(fileobj)
    header = fileobj.read(4096)
    mimetype = magic.from_buffer(header, mime=True)
    # Depending on the python-magic version the result is bytes or
    # already text; only decode when needed so a text result does not
    # blow up with AttributeError on Python 3.
    if isinstance(mimetype, bytes):
        mimetype = mimetype.decode('utf-8')
    fileobj.seek(0)
    if MIMELOOKUP.get(mimetype) == 'ZIP':
        # consider whether it's a Microsoft Office document
        if b"[Content_Types].xml" in header:
            return xlsx_mime
    # There's an issue with vnd.ms-excel being returned from XLSX files, too.
    if mimetype == 'application/vnd.ms-excel' and header[:2] == b'PK':
        return xlsx_mime
    return mimetype
Esempio n. 11
0
 def __init__(self, name, fileobj, delimiter=None, quotechar=None,
              encoding='utf-8', window=None):
     """Set up the row set and pre-read a sample of lines.

     ``fileobj`` is passed through ``messytables.seekable_stream`` and
     wrapped in ``UTF8Recoder`` together with ``encoding``; ``ilines``
     then supplies the line iterator kept in ``self.lines``.  Up to
     ``window`` lines are consumed from it into ``self._sample``.

     Falsy arguments are replaced by defaults: ``delimiter`` becomes
     ``','``, ``quotechar`` becomes ``'"'`` and ``window`` becomes 1000.
     """
     self.name = name

     # Build the reading pipeline: seekable stream -> recoder -> lines.
     self.fileobj = UTF8Recoder(messytables.seekable_stream(fileobj),
                                encoding)
     self.lines = ilines(self.fileobj)

     # Options with a fallback when the caller supplied nothing usable.
     self.delimiter = delimiter if delimiter else ','
     self.quotechar = quotechar if quotechar else '"'
     self.window = window if window else 1000

     # Pull at most ``window`` lines; a shorter input simply ends early.
     self._sample = []
     for _ in xrange(self.window):
         try:
             line = self.lines.next()
         except StopIteration:
             break
         self._sample.append(line)

     super(CSVRowSet, self).__init__()
Esempio n. 12
0
def get_mime(fileobj):
    """Sniff the MIME type of ``fileobj`` from its first 4096 bytes.

    The stream obtained from ``messytables.seekable_stream`` is read,
    handed to ``magic.from_buffer`` and then rewound to offset 0.

    Returns the detected MIME type as text.  The XLSX (spreadsheetml)
    MIME type is returned instead when either
    * ``MIMELOOKUP`` maps the detected type to ``'ZIP'`` and the header
      contains ``[Content_Types].xml``, or
    * the detected type is ``application/vnd.ms-excel`` but the header
      starts with ``PK``.
    """
    import magic
    xlsx_mime = ('application/vnd.openxmlformats-officedocument'
                 '.spreadsheetml.sheet')
    # Peeking at the start of the stream requires being able to seek
    # back afterwards, so go through seekable_stream first.
    fileobj = messytables.seekable_stream(fileobj)
    header = fileobj.read(4096)
    mimetype = magic.from_buffer(header, mime=True)
    # Depending on the python-magic version the result is bytes or
    # already text; only decode when needed so a text result does not
    # blow up with AttributeError on Python 3.
    if isinstance(mimetype, bytes):
        mimetype = mimetype.decode('utf-8')
    fileobj.seek(0)
    if MIMELOOKUP.get(mimetype) == 'ZIP':
        # consider whether it's a Microsoft Office document
        if b"[Content_Types].xml" in header:
            return xlsx_mime
    # There's an issue with vnd.ms-excel being returned from XLSX files, too.
    if mimetype == 'application/vnd.ms-excel' and header[:2] == b'PK':
        return xlsx_mime
    return mimetype
Esempio n. 13
0
    def from_fileobj(cls, fileobj, mimetype=None, extension=None):
        """Open ``fileobj`` with the table set class matching its format.

        The format is chosen from ``mimetype`` (e.g. ``'text/csv'``) or
        ``extension`` (e.g. ``'tsv'``, compared case-insensitively).
        Whenever ``mimetype`` is None it is sniffed with the ``magic``
        library from the first 1024 bytes of the stream, whether or not
        an extension was supplied.  Consult the source for the
        recognized MIME types and file extensions.

        Returns whatever the selected class's ``from_fileobj`` returns.

        Raises ValueError when neither the MIME type nor the extension
        is recognized.
        """
        if mimetype is None:
            import magic
            # Peeking at the start of the stream requires being able to
            # seek back afterwards, so go through seekable_stream first.
            fileobj = messytables.seekable_stream(fileobj)
            header = fileobj.read(1024)
            mimetype = magic.from_buffer(header, mime=True)
            fileobj.seek(0)

        # Normalise once; a missing or empty extension never matches.
        ext = extension.lower() if extension else None

        # ZIP is checked first: for an archive the extension describes
        # the inner files, so it must not be matched before the ZIP
        # MIME type.
        if (mimetype in ('application/x-zip-compressed', 'application/zip')
                or ext == 'zip'):
            return ZIPTableSet.from_fileobj(fileobj)

        if (mimetype in ('text/csv', 'text/comma-separated-values')
                or ext == 'csv'):
            return CSVTableSet.from_fileobj(fileobj)  # guess delimiter
        if (mimetype in ('text/tsv', 'text/tab-separated-values')
                or ext == 'tsv'):
            return CSVTableSet.from_fileobj(fileobj, delimiter='\t')
        if (mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                         'application/xls')
                or ext == 'xls'):
            return XLSTableSet.from_fileobj(fileobj)
        if (mimetype in (
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',)
                or ext == 'xlsx'):
            return XLSXTableSet.from_fileobj(fileobj)

        if mimetype:
            raise ValueError("Unrecognized MIME type: " + mimetype)
        if extension:
            raise ValueError("Could not determine MIME type and "
                             "unrecognized extension: " + extension)
        raise ValueError(
            "Could not determine MIME type and no extension given.")
Esempio n. 14
0
    def from_fileobj(cls, fileobj, mimetype=None, extension=None):
        """Open ``fileobj`` with the table set class matching its format.

        The format is chosen from ``mimetype`` (e.g. ``'text/csv'``) or
        ``extension`` (e.g. ``'tsv'``, compared case-insensitively).
        Whenever ``mimetype`` is None it is sniffed with the ``magic``
        library from the first 1024 bytes of the stream, whether or not
        an extension was supplied.  Consult the source for the
        recognized MIME types and file extensions.

        Returns whatever the selected class's ``from_fileobj`` returns.
        CSV input is opened with an explicit ``','`` delimiter and TSV
        input with a tab delimiter.

        Raises ValueError when neither the MIME type nor the extension
        is recognized.
        """
        if mimetype is None:
            import magic
            # Peeking at the start of the stream requires being able to
            # seek back afterwards, so go through seekable_stream first.
            fileobj = messytables.seekable_stream(fileobj)
            header = fileobj.read(1024)
            mimetype = magic.from_buffer(header, mime=True)
            fileobj.seek(0)

        # Normalise once; a missing or empty extension never matches.
        ext = extension.lower() if extension else None

        # ZIP is checked first: for an archive the extension describes
        # the inner files, so it must not be matched before the ZIP
        # MIME type.
        if (mimetype in ('application/x-zip-compressed', 'application/zip')
                or ext == 'zip'):
            return ZIPTableSet.from_fileobj(fileobj)

        if (mimetype in ('text/csv', 'text/comma-separated-values')
                or ext == 'csv'):
            return CSVTableSet.from_fileobj(fileobj, delimiter=',')
        if (mimetype in ('text/tsv', 'text/tab-separated-values')
                or ext == 'tsv'):
            return CSVTableSet.from_fileobj(fileobj, delimiter='\t')
        if (mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                         'application/xls', 'application/excel')
                or ext == 'xls'):
            return XLSTableSet.from_fileobj(fileobj)
        if (mimetype in (
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',)
                or ext == 'xlsx'):
            return XLSXTableSet.from_fileobj(fileobj)

        if mimetype:
            raise ValueError("Unrecognized MIME type: " + mimetype)
        if extension:
            raise ValueError("Could not determine MIME type and "
                             "unrecognized extension: " + extension)
        raise ValueError(
            "Could not determine MIME type and no extension given.")