Exemple #1
0
    def _processfile(self, path_or_file, *args, **kwds):
        """Process one input file chunk by chunk and write the result to CSV.

        Each chunk yielded by the file's ``dfreader`` is passed through
        ``self.process``; successfully processed chunks are appended to the
        output CSV, whose header row is created from ``self.fields``.

        Parameters:
            path_or_file: Either a path string, which is turned into a file
                object via ``File.guess``, or an already-built file object
                exposing ``basename()``, ``get_outfile()`` and ``dfreader``.
            *args: Forwarded unchanged to ``self.process``.
            **kwds: Forwarded unchanged to ``self.process``. The following
                keys are also read here:
                ``testbadlines`` - if truthy and the file is a ``Csv``, scan
                it with ``Csv.locate_badlines`` and store the result in
                ``self.badlines`` / ``self.badlinescount``;
                ``outdir`` - output folder handed to ``newfolder``
                (default ``'processed'``);
                ``outfile`` - explicit output path; when missing or falsy the
                path comes from the file object's ``get_outfile``.

        Returns:
            Whatever ``self.evaluate()`` returns.

        Side effects:
            Sets ``self._file``, ``self.filename`` and ``self.emptysheets``;
            increments ``self.countsout``, ``self.normalized`` and
            ``self.incomplete_excel``; writes to the output file; disables
            the automatic garbage collector (see the note in the loop).
        """
        self.info("START")
        self._file = path_or_file
        # ``str`` is a subclass of ``basestring`` in Python 2, so one check
        # covers both byte and unicode paths.
        if isinstance(path_or_file, basestring):
            # Presumably getkwds keeps only the options pd.read_csv accepts;
            # verify against its definition.
            self._file = File.guess(path_or_file, **getkwds(kwds, pd.read_csv))

        self.filename = self._file.basename()
        if kwds.get('testbadlines') and isinstance(self._file, Csv):
            self.info("Checking rows in '%s' for embedded delimiters." %
                      self.filename)
            self.badlines = Csv.locate_badlines(self._file.path,
                                                delimiter=self._file.delimiter)

            self.badlinescount = len(self.badlines)
            if self.badlinescount >= 1:
                self.warning("%s bad lines have been found in '%s'." %
                             (self.badlinescount, self.filename))

        outdir = newfolder(kwds.get('outdir', 'processed'))
        outfile = kwds.get('outfile')
        if not outfile:
            outfile = self._file.get_outfile(self.filename, dirname=outdir)
        createcsv(outfile, self.fields)

        for df in self._file.dfreader:
            try:
                df = self.process(df, *args, **kwds)
                self.countsout += self.countvalues(df)
                self.normalized += len(df)
            except IncompleteExcelFile as e:
                # ``df`` is still the raw, unprocessed chunk at this point, so
                # it must not be written under the ``self.fields`` header.
                self.incomplete_excel += 1
                self.warning("Skipping incomplete chunk of '%s': %s" %
                             (self.filename, e))
            else:
                File.append(outfile, df.to_csvstring(header=False))
                self.info("%s rows written to %s" % (len(df), outfile))

            # NOTE(review): gc.disable() turns automatic collection off for
            # the whole process and nothing here re-enables it; only the
            # explicit collect() below reclaims cycles. Kept as in the
            # original - confirm this is intentional.
            gc.disable()
            gc.collect()

        self.emptysheets = getattr(self._file, 'emptysheets', None)
        self.info("END")
        # Python 2 print statement: emits a blank line after the log output.
        print
        return self.evaluate()