Example #1
0
    def _load_from_file(self):
        """Load content from a file

        Sibling files named ``<base><sep><sheet name><sep><index><ext>``
        are looked up next to ``self._file_name``. When none are found the
        file itself is returned as the only sheet; otherwise each sibling
        becomes one sheet, ordered by the index embedded in its name.

        :returns: a list of :class:`NamedContent`, one per sheet
        """
        self.__line_terminator = self._keywords.get(
            constants.KEYWORD_LINE_TERMINATOR, self.__line_terminator)
        # splitext (unlike split('.')) copes with dots in directory names,
        # several dots in the file name and a missing extension; the
        # extension it returns already carries its leading dot.
        base_name, extension = os.path.splitext(self._file_name)
        filepattern = "%s%s*%s*%s" % (base_name, DEFAULT_SEPARATOR,
                                      DEFAULT_SEPARATOR, extension)
        # Escape the literal parts so path characters such as '.', '+',
        # '(' or a backslash are not interpreted as regex syntax.
        matcher = re.compile("%s%s(.*)%s(.*)%s" % (
            re.escape(base_name), re.escape(DEFAULT_SEPARATOR),
            re.escape(DEFAULT_SEPARATOR), re.escape(extension)))
        tmp_file_list = []
        for filen in glob.glob(filepattern):
            result = matcher.match(filen)
            if result is None:
                # glob treated part of the path as a wildcard and returned
                # a file that does not literally carry this book's name
                continue
            tmp_file_list.append((result.group(1), result.group(2), filen))

        if not tmp_file_list:
            file_parts = os.path.split(self._file_name)
            return [NamedContent(file_parts[-1], self._file_name)]

        def _index_key(row):
            # Order numeric indices by value so that "10" follows "9";
            # non-numeric indices come afterwards in string order.
            index = row[1]
            try:
                return (0, int(index), index)
            except ValueError:
                return (1, 0, index)

        return [
            NamedContent(lsheetname, filen)
            for lsheetname, _, filen in sorted(tmp_file_list, key=_index_key)
        ]
Example #2
0
    def _load_from_stream(self):
        """Load content from memory

        Without the multiple-sheets flag the stream itself is wrapped as a
        single sheet named after ``self._file_type``. With the flag set,
        the whole stream is read, cut at the sheet separator, and every
        non-empty piece becomes its own in-memory sheet whose name is taken
        from the piece's first line.

        :returns: a list of :class:`NamedContent`
        """
        self.__load_from_memory_flag = True
        self.__line_terminator = self._keywords.get(
            constants.KEYWORD_LINE_TERMINATOR, self.__line_terminator
        )
        separator = DEFAULT_SHEET_SEPARATOR_FORMATTER % self.__line_terminator

        if not self.__multiple_sheets:
            # rewind only when the stream supports it
            if hasattr(self._file_stream, "seek"):
                self._file_stream.seek(0)
            return [NamedContent(self._file_type, self._file_stream)]

        # will be slow for large files
        self._file_stream.seek(0)
        whole_text = self._file_stream.read()
        named_contents = []
        for chunk in whole_text.split(separator):
            if chunk == "":  # skip empty named sheet
                continue

            rows = chunk.split(self.__line_terminator)
            # the first line of a chunk carries the sheet name
            header = re.match(constants.SEPARATOR_MATCHER, rows[0])
            body_text = "\n".join(rows[1:])
            sheet_name = header.group(1)
            named_contents.append(
                NamedContent(sheet_name, compact.StringIO(body_text))
            )
        return named_contents
Example #3
0
    def read_sheet(self, index):
        """Build an in-memory CSV reader for one member of the zip archive.

        The member's bytes are read from ``self.zipfile``, their encoding
        is guessed with ``chardet`` and the decoded text is wrapped in a
        ``StringIO``.

        :params int index: position of the sheet in ``self.content_array``
        :returns: a :class:`CSVinMemoryReader` built with ``self.keywords``
        """
        entry = self.content_array[index]
        content = self.zipfile.read(entry.payload)
        encoding_guess = chardet.detect(content)
        # chardet reports ``None`` when it cannot decide (for instance on
        # an empty member); ``decode(None)`` would raise TypeError, so fall
        # back to UTF-8 in that case.
        encoding = encoding_guess["encoding"] or "utf-8"
        sheet = StringIO(content.decode(encoding))

        return CSVinMemoryReader(
            NamedContent(entry.name, sheet), **self.keywords
        )
Example #4
0
 def test_writer(self):
     """Rows written one by one and in bulk end up in the sheet payload."""
     rows = [[1, 2], [3, 4], [5, 6]]
     first_row, remaining_rows = rows[0], rows[1:]
     native_sheet = NamedContent("test", [])
     writer = ArrayWriter(None, native_sheet, "test")
     writer.write_row(first_row)
     writer.write_array(remaining_rows)
     assert native_sheet.payload == rows
Example #5
0
def test_utf16_decoding():
    """Read the UTF-16 CSV fixture from disk and compare its content."""
    fixture_path = os.path.join("tests", "fixtures", "csv-encoding-utf16.csv")
    source = NamedContent("csv", fixture_path)
    reader = CSVFileReader(source, encoding="utf-16")
    content = list(reader.to_array())
    if PY2:
        # re-encode the first row to UTF-8 before comparing
        first_row = content[0]
        content[0] = [cell.encode("utf-8") for cell in first_row]
    expected = [["Äkkilähdöt", "Matkakirjoituksia", "Matkatoimistot"]]
    eq_(content, expected)
Example #6
0
 def test_sheet_memory_reader_delimiters(self):
     """Read the test file through a memory stream with ``delimiters=None``."""
     stream = manager.get_io(self.file_type)
     with open(self.test_file, "r") as source:
         text = source.read()
     stream.write(text)
     stream.seek(0)
     named_stream = NamedContent(self.file_type, stream)
     reader = CSVinMemoryReader(named_stream, delimiters=None)
     rows = list(reader.to_array())
     self.assertEqual(rows, [[1, 2, 3], [4, 5, 6], [7, 8, 9]])
Example #7
0
 def test_sheet_memory_reader(self):
     """Read the test file through a memory stream and check the rows."""
     stream = manager.get_io(self.file_type)
     with open(self.test_file, "r") as source:
         text = source.read()
     stream.write(text)
     stream.seek(0)
     named_stream = NamedContent(self.file_type, stream)
     reader = CSVinMemoryReader(named_stream)
     rows = list(reader.to_array())
     self.assertEqual(rows, self.expected_data)
Example #8
0
def test_utf16_memory_decoding():
    """Decode a UTF-16 byte stream held in memory and compare its content."""
    raw_text = u"Äkkilähdöt,Matkakirjoituksia,Matkatoimistot"
    byte_stream = BytesIO(raw_text.encode("utf-16"))
    source = NamedContent("csv", byte_stream)
    reader = CSVinMemoryReader(source, encoding="utf-16")
    content = list(reader.to_array())
    if PY2:
        # re-encode the first row to UTF-8 before comparing
        first_row = content[0]
        content[0] = [cell.encode("utf-8") for cell in first_row]
    expected = [["Äkkilähdöt", "Matkakirjoituksia", "Matkatoimistot"]]
    eq_(content, expected)
Example #9
0
def test_utf16_memory_decoding():
    """Check that UTF-16 bytes supplied in memory are decoded correctly."""
    expected = [['Äkkilähdöt', 'Matkakirjoituksia', 'Matkatoimistot']]
    encoded = u'Äkkilähdöt,Matkakirjoituksia,Matkatoimistot'.encode('utf-16')
    named_stream = NamedContent('csv', BytesIO(encoded))
    reader = CSVinMemoryReader(named_stream, encoding="utf-16")
    content = list(reader.to_array())
    if PY2:
        # re-encode the first row to UTF-8 before comparing
        content[0] = [text.encode('utf-8') for text in content[0]]
    eq_(content, expected)
Example #10
0
    def __init__(self,
                 file_stream,
                 file_type,
                 multiple_sheets=False,
                 **keywords):
        """Prepare the sheet list from content held in memory

        The result is stored on ``self.content_array``. Without
        ``multiple_sheets`` the stream itself is wrapped as one sheet named
        after ``file_type``. With it, the whole stream is read, cut at the
        sheet separator, and every non-empty piece becomes its own
        in-memory sheet whose name is taken from the piece's first line.

        :params stream file_stream: the actual file content in memory
        :params str file_type: the file format; the TSV format sets the
            ``dialect`` keyword
        :params bool multiple_sheets: whether the stream holds several
            sheets
        :params keywords: reader options, kept on ``self.keywords``
        """
        self.handles = []
        self.keywords = keywords
        if file_type == constants.FILE_FORMAT_TSV:
            self.keywords["dialect"] = constants.KEYWORD_TSV_DIALECT
        self.file_type = file_type

        self.__load_from_memory_flag = True
        self.__line_terminator = keywords.get(
            constants.KEYWORD_LINE_TERMINATOR, constants.DEFAULT_CSV_NEWLINE)
        separator = DEFAULT_SHEET_SEPARATOR_FORMATTER % self.__line_terminator

        if not multiple_sheets:
            # rewind only when the stream supports it
            if hasattr(file_stream, "seek"):
                file_stream.seek(0)
            self.content_array = [NamedContent(self.file_type, file_stream)]
            return

        # will be slow for large files
        file_stream.seek(0)
        whole_text = file_stream.read()
        named_contents = []
        for chunk in whole_text.split(separator):
            if chunk == "":  # skip empty named sheet
                continue

            rows = chunk.split(self.__line_terminator)
            # the first line of a chunk carries the sheet name
            header = re.match(constants.SEPARATOR_MATCHER, rows[0])
            body_text = "\n".join(rows[1:])
            sheet_name = header.group(1)
            named_contents.append(
                NamedContent(sheet_name, compact.StringIO(body_text)))
        self.content_array = named_contents
Example #11
0
    def __init__(self, file_name, file_type, **keywords):
        """Prepare the sheet list from a file on disk

        Sibling files named ``<base><sep><sheet name><sep><index><ext>``
        are looked up next to ``file_name``. When none are found the file
        itself becomes the only sheet; otherwise each sibling becomes one
        sheet, ordered by the index embedded in its name. The result is
        stored on ``self.content_array``.

        :params str file_name: an accessible file path
        :params str file_type: the file format; the TSV format sets the
            ``dialect`` keyword
        :params keywords: reader options, kept on ``self.keywords``
        """
        self.handles = []
        self.keywords = keywords
        if file_type == constants.FILE_FORMAT_TSV:
            self.keywords["dialect"] = constants.KEYWORD_TSV_DIALECT
        self.__line_terminator = keywords.get(
            constants.KEYWORD_LINE_TERMINATOR, DEFAULT_NEWLINE)
        # the extension returned by splitext carries its leading dot
        base_name, extension = os.path.splitext(file_name)
        separator = constants.DEFAULT_MULTI_CSV_SEPARATOR
        filepattern = "%s%s*%s*%s" % (
            base_name,
            separator,
            separator,
            extension,
        )
        # Escape the literal parts so path characters such as '.', '+',
        # '(' or a backslash are not interpreted as regex syntax.
        matcher = re.compile("%s%s(.*)%s(.*)%s" % (
            re.escape(base_name),
            re.escape(separator),
            re.escape(separator),
            re.escape(extension),
        ))
        tmp_file_list = []
        for filen in glob.glob(filepattern):
            result = matcher.match(filen)
            if result is None:
                # glob treated part of the path as a wildcard and returned
                # a file that does not literally carry this book's name
                continue
            tmp_file_list.append((result.group(1), result.group(2), filen))

        if not tmp_file_list:
            file_parts = os.path.split(file_name)
            self.content_array = [NamedContent(file_parts[-1], file_name)]
            return

        def _index_key(row):
            # Order numeric indices by value so that "10" follows "9";
            # non-numeric indices come afterwards in string order.
            index = row[1]
            try:
                return (0, int(index), index)
            except ValueError:
                return (1, 0, index)

        self.content_array = [
            NamedContent(lsheetname, filen)
            for lsheetname, _, filen in sorted(tmp_file_list, key=_index_key)
        ]
Example #12
0
    def __init__(self, file_alike_object, file_type, **keywords):
        """Open a zip archive and register each member as a sheet.

        Every name returned by the archive's ``namelist()`` becomes a
        :class:`NamedContent` whose payload is the member name. For the
        TSVZ file type the ``dialect`` keyword is set to the TSV dialect.

        :params file_alike_object: whatever ``zipfile.ZipFile`` accepts
        :params str file_type: the file format of the archive
        :params keywords: reader options, kept on ``self.keywords``
        :raises zipfile.BadZipfile: re-raised after printing a hint
        """
        self.content_array = []
        try:
            self.zipfile = zipfile.ZipFile(file_alike_object, "r")
            members = []
            for member_name in self.zipfile.namelist():
                sheet_name = _get_sheet_name(member_name)
                members.append(NamedContent(sheet_name, member_name))
            self.content_array = members
            self.keywords = keywords
            is_tsvz = file_type == constants.FILE_FORMAT_TSVZ
            if is_tsvz:
                self.keywords["dialect"] = constants.KEYWORD_TSV_DIALECT

        except zipfile.BadZipfile:
            print("StringIO instance was passed by any chance?")
            raise
Example #13
0
 def __init__(self, name, payload):
     """Initialise the ``NamedContent`` base and an empty column-name list.

     :params name: forwarded unchanged to ``NamedContent.__init__``
     :params payload: forwarded unchanged to ``NamedContent.__init__``
     """
     NamedContent.__init__(self, name, payload)
     # starts empty; nothing in this constructor fills it
     self.colnames = []
Example #14
0
 def test_sheet_file_reader_delimiters(self):
     """Read the test file from disk with ``delimiters=None``."""
     source = NamedContent(self.file_type, self.test_file)
     reader = CSVFileReader(source, delimiters=None)
     rows = list(reader.to_array())
     self.assertEqual(rows, [[1, 2, 3], [4, 5, 6], [7, 8, 9]])
Example #15
0
 def test_sheet_file_reader(self):
     """Read the test file from disk and compare with the expected data."""
     source = NamedContent(self.file_type, self.test_file)
     reader = CSVFileReader(source)
     rows = list(reader.to_array())
     self.assertEqual(rows, self.expected_data)
Example #16
0
 def test_sheet_file_reader(self):
     """Read the test file from disk with ``auto_detect_delimiter=False``."""
     expected_rows = [[1], [4, 5, 6], ["", 7]]
     source = NamedContent(self.file_type, self.test_file)
     reader = CSVFileReader(source, auto_detect_delimiter=False)
     rows = list(reader.to_array())
     self.assertEqual(rows, expected_rows)
Example #17
0
 def __init__(self, name, payload):
     """Initialise the ``NamedContent`` base and an empty column-name list.

     :params name: forwarded unchanged to ``NamedContent.__init__``
     :params payload: forwarded unchanged to ``NamedContent.__init__``
     """
     NamedContent.__init__(self, name, payload)
     # starts empty; nothing in this constructor fills it
     self.colnames = []