Example #1
def atexit_delete(filename):
    '''A utility function for threaded jobs started in CSVFixer to delete a given file.

    /filename/    Name of the file to delete.
    '''
    try:
        os.unlink(filename)
    except Exception as ex:
        logger.warning('CSVFixer: Exception "{0}" deleting file "{1}".'.format(ex, filename))
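A minimal usage sketch for the helper above (the file name is hypothetical; atexit, os and logger are module-level imports these snippets rely on): the function is meant to be handed to atexit.register(), which is how Example #4 schedules processed input files for deletion.

import atexit

# Register a file for best-effort deletion when the interpreter exits;
# any failure is only logged as a warning by atexit_delete().
atexit.register(atexit_delete, '/tmp/example-input.zip')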
Example #2
def test_4_2():
    jsodata = io.StringIO('{"a": 123}\n{"a": 45}')
    xout.re_init()
    try:
        p4 = Pipeline({
            'input-format': 'json',
            'header': ['a'],
            'dialect': 'nix',
            "dialects": {
                "nix": {
                    "quoting": "QUOTE_ALL",
                    "lineterminator": "\n"
                }
            }
        })
        p4(jsodata, xout)
    except Exception as ex:
        logger.warning('{1}: {0} ({2})'.format(ex,
                                               type(ex).__name__, p4.dialect))
        assert False  # Fail the test if the Pipeline raised an exception.
    # Expecting 3 lines: 123 / 45 / 2 (number of lines)
    assert xout.readlines() == [''.join(["123", "45", "2"])]
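For reference, a sketch of driving the same configuration outside the test harness, with io.StringIO objects standing in for the test's xout helper (the config dict is copied from the test above; the per-key comments and the use of plain StringIO are assumptions, not part of the original suite):

import io

config = {
    'input-format': 'json',   # newline-delimited JSON records on input
    'header': ['a'],          # output column(s)
    'dialect': 'nix',         # select the dialect defined below
    'dialects': {'nix': {'quoting': 'QUOTE_ALL', 'lineterminator': '\n'}},
}
source = io.StringIO('{"a": 123}\n{"a": 45}')
sink = io.StringIO()
rows = Pipeline(config)(source, sink)   # __call__ returns the number of rows processed (see Example #5)
print('{0} rows written to sink'.format(rows))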
Example #3
    def log_warn(self, msg):
        '''Log a warning message through the module logger.'''
        logger.warning(msg)
Example #4
def task(cwd):
    '''Task as a gevent Greenlet that processes one file name pattern.

    /cwd/       Current working directory.
    '''
    while True:
        # config  = A dict containing configuration for the task;
        # pattern = Pattern to match for input file names.
        try:
            config, pattern = jobqu.get(timeout=10)
        except Empty:
            break
        if pattern == '' or config.get('disabled', False):
            logger.debug('CSVFixer: Ignore empty pattern or disabled task')
            continue
        pattern = config.get('pattern', pattern) # 'pattern' may be configured inside task
        dest = config.get('destination', cwd)
        linkfolder = config.get('link-folder')
        forge_path(dest)
        process = Pipeline(config)
        keep_times = config.get('times', False)
        rename = [ (re.compile(xk),xv) for xk,xv in config.get('rename', {}).items() ]
        logger.debug('CSVFixer: task = %s, destination = "%s"' % (pattern, dest))
        for zipfn in glob.glob(pattern):
            stinfo = os.stat(zipfn)
            logger.debug('CSVFixer: Fixing file "{0}", mtime = {1}'.format(
                    zipfn, time.strftime('%c', time.localtime(stinfo.st_mtime))))
            if zipfn[-4:] != '.zip':
                ## Assume that it is a text CSV file if file name does not end with .zip:
                zipf = None
                ziplist = [zipfn]
            else:
                try:
                    zipf = ZipFile(zipfn)
                    ziplist = zipf.namelist()
                    logger.debug('CSVFixer: Found list in zip file = %s' % (ziplist,))
                except BadZipfile:
                    logger.warning('CSVFixer: zip file "%s" is bad.' % (zipfn))
                    continue
            fbasename = fwpath = ''
            for fn in ziplist:
                if fwpath == '' or config.get('file-mode') != 'a':
                    fwname = fn
                    for rex, fmt in rename:
                        mx = rex.search(fwname)
                        if mx:
                            try:
                                fwname = fmt.format(*mx.groups())
                            except Exception as ex:
                                logger.warning('Exception "{3}" fixing "{0}" with "{1}" and groups = {2}'.format(fn, fmt, mx.groups(), ex))
                            break
                    fbasename = os.path.basename(fwname)
                    fwpath = os.path.join(dest, fbasename)
                logger.debug('Processing file "{0}" to "{1}"'.format(fn, fwname))
                lines = process(open(fn, 'r') if zipf is None else zipf.open(fn, 'r'), fwpath)
                logger.debug('{0} lines processed in file "{1}"'.format(lines, fn))
                # Set fixed file's timestamps if so configured:
                if keep_times:
                    os.utime(fwpath, (stinfo.st_mtime, stinfo.st_mtime))
                    logger.debug('Set file "{0}" atime and mtime to {1}'.format(
                            fwpath, time.strftime('%c', time.localtime(stinfo.st_mtime))))
            # Delete or post-process the .zip file at exit if so configured:
            if config.get('delete', False):
                logger.debug('File "%s" registered to be deleted' % (zipfn))
                atexit.register(atexit_delete, zipfn)
            else:
                act = config.get('postprocess')
                if act is not None:
                    logger.debug('File "%s" registered to be postprocessed with "%s"' % (zipfn, act))
                    atexit.register(atexit_process, zipfn, act)
            # Delete empty file if so configured:
            if fwpath != '' and config.get('delete-empty', True) and os.stat(fwpath).st_size < 1:
                os.unlink(fwpath)
                logger.debug('Deleted empty output file "{0}"'.format(fwpath))
            elif linkfolder:
                try:
                    os.link(fwpath, os.path.join(linkfolder, fbasename))
                except Exception as err:
                    logger.error('Error linking file "{0}" to folder "{1}": {2}'.format(fwpath, linkfolder, err))
        jobqu.task_done()
        logger.debug('Task "{0}" completed'.format(pattern))
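A sketch of how work might be handed to this task (jobqu, task and the config keys are taken from the snippet itself; the gevent calls, paths and concrete values are illustrative assumptions):

import gevent

config = {
    'pattern': '/data/incoming/*.zip',           # input file pattern ('pattern' may also come from the queue item)
    'destination': '/data/fixed',                # where fixed files are written (defaults to cwd)
    'times': True,                               # copy the source file's mtime onto the output
    'rename': {r'(.*)\.csv$': '{0}-fixed.csv'},  # regex -> format string applied to output file names
    'delete-empty': True,                        # remove zero-byte outputs
}
jobqu.put((config, config['pattern']))           # task() pulls (config, pattern) tuples off jobqu
worker = gevent.spawn(task, '/data')             # the cwd argument is the fallback destination
worker.join()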
Example #5
    def __call__(self, fnr, fnw):
        ''' Process given file and generate output.

        /fnr/       (Name of) file to read from.
        /fnw/       (Name of) file to write to.

        The /fnr/ file is read with a default csv.DictReader() as of now, or
        a JSOReader object if explicitly configured so.
        -- May need to revise to allow handling of CSV format variations.

        Returns number of rows (records) processed in the CSV file.
        '''
        # Open files if they are given as file names:
        fin = csvio.Reader(open(fnr, 'r') if isinstance(fnr, str) else fnr, self.ends)
        fout = open(fnw, self.file_mode) if isinstance(fnw, str) else fnw
        write_header = self.write_header and (fout.tell() == 0)
        # Skip non-data if so configured:
        skip = {'line': 0, 'more': 0, 'pass': 0, 'till': 0}
        skip.update(self.skip)
        lineno = 0
        while skip['more']:
            try:
                line = next(fin)
                if isinstance(line, bytes):
                    line = line.decode('utf-8')
            except StopIteration:
                logger.warning('Unexpected end-of-file when skipping to data in {0}:{1}'.format(fnr, lineno))
                return 0
            if skip['till'] and skip['till'].match(line):
                logger.debug('Skip-till matching line {0}: {1}'.format(lineno+1, line))
                fin.backup()
                break
            lineno += 1
            if (skip['pass'] and skip['pass'].match(line)) or\
               (skip['line'] and skip['line'] <= lineno):
                break
            logger.debug('Skipping line {0}: {1}'.format(lineno, line))
        # If no header is configured, assume the next line in /fin/ is the header:
        # TBD: Make output header different than input header, optionally.
        rheader = None
        header = self.header
        logger.debug('{0}: {1}to output CSV header: {2}'.format(fnw, '' if write_header else 'not ', header))
        if self.read_header or header is None:
            try:
                rheader = [ self.header_clean.sub('', x) for x in next(fin).split(',') ]
            except Exception as ex:
                logger.debug('{2}: {0} (lineno = {1})'.format(ex, lineno, type(ex).__name__))
                logger.warning('Unexpected error when reading CSV header in {0}:{1}'.format(fnr, lineno))
                return 0
            lineno += 1
            logger.debug('Read header: {0}'.format(rheader))
            for rex, fmt in self.header_fix:
                nhdr = []
                for col in rheader:
                    mx = rex.match(col)
                    if mx:
                        try:
                            col = fmt.format(*mx.groups())
                        except Exception as ex:
                            logger.warning('Exception "{3}" fixing "{0}" with "{1}" and groups = {2}'.format(col, fmt, mx.groups(), ex))
                    nhdr.append(col)
                rheader = nhdr
            logger.debug('Header fixed: {0}'.format(rheader))
        #
        # Read through the input file and write out: Read error(s) are logged but ignored.
        #
        lineno = 0
        with csvio.Writer(fout, header or rheader, write_header, self.dialect) as fw:
            # filters: Filters to pass data through. If missing, then straight thru.
            filter1 = fw
            try:
                for fltr in reversed(self.filters):
                    modname, cname = fltr.rsplit('.', 1)
                    mod = __import__('c9r.util.filter.'+modname, fromlist=[cname])
                    klass = getattr(mod, cname)
                    filter1 = klass(filter1).open()
            except ImportError:
                logger.warning('ImportError for filter {0}'.format(fltr))
                raise
            csvreader = self.ireader(fin, fieldnames=(rheader or header))
            while True:
                try:
                    line = next(csvreader)
                    lineno += 1
                    filter1.write(line)
                except StopIteration:
                    break
                except Exception as ex:
                    logger.warning('{2}: {0} (lineno = {1})'.format(ex, lineno, type(ex).__name__))
                    logger.debug('\tline = {0}'.format(line))
                    print('-'*60)
                    traceback.print_exc(file=sys.stdout)
                    print('-'*60)
                    #logger.debug(traceback.format_tb(sys.exc_info()))
            logger.debug('Closing filter 1: {0}, lines = {1}, fout size = {2}'.format(type(filter1).__name__, lineno, fout.tell()))
            filter1.close()
        return lineno
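Per the docstring and the isinstance() checks above, /fnr/ and /fnw/ may be either open file objects or path strings. A minimal call sketch with hypothetical file names, assuming fixer is a Pipeline instance configured as in Example #2:

rows = fixer('incoming/report.csv', 'fixed/report.csv')   # both files are opened by name; output honors self.file_mode
print('{0} rows processed'.format(rows))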