def load_data():
    """ read command line options and configuration file, then process
    data import of given section, or all sections if no section is
    given on command line """

    # first parse command line options, and set pgloader.options values
    # accordingly
    conffile, args = parse_options()

    # now init db connection
    config = parse_config(conffile)

    from pgloader.logger  import log
    from pgloader.tools   import read_path, check_path
    from pgloader.options import VERBOSE
    import pgloader.options

    if pgloader.options.REFORMAT_PATH:
        rpath  = read_path(pgloader.options.REFORMAT_PATH, log, check=False)
        crpath = check_path(rpath, log)
    else:
        rpath = crpath = None

    if not crpath:
        if rpath:
            # don't check same path entries twice
            default_rpath = set(crpath) \
                            - set(pgloader.options.DEFAULT_REFORMAT_PATH)
        else:
            default_rpath = pgloader.options.DEFAULT_REFORMAT_PATH

        pgloader.options.REFORMAT_PATH = check_path(default_rpath, log)
    else:
        pgloader.options.REFORMAT_PATH = rpath

    log.info('Reformat path is %s', pgloader.options.REFORMAT_PATH)

    # load some pgloader package modules
    from pgloader.options  import VERBOSE, DEBUG, QUIET, SUMMARY
    from pgloader.options  import DRY_RUN, PEDANTIC, VACUUM
    from pgloader.options  import MAX_PARALLEL_SECTIONS
    from pgloader.options  import LOAD_FROM_STDIN, LOAD_TO_TABLE
    from pgloader.options  import FILE_BOUNDARIES
    from pgloader.pgloader import PGLoader
    from pgloader.tools    import PGLoader_Error

    sections = []
    summary  = {}

    # args are meant to be configuration sections, or filenames, or stdin
    if LOAD_FROM_STDIN:
        if FILE_BOUNDARIES is not None:
            log.warning("Can't use --boundaries on stdin")

        if len(args) == 0:
            s = '<stdin>'
            config.add_section(s)
            config.set(s, 'table', LOAD_TO_TABLE)
            config.set(s, 'filename', 'sys.stdin')
            config.set(s, 'columns', '*')
            config.set(s, 'format', 'csv')
            sections.append(s)

        elif len(args) == 1:
            if config.has_section(args[0]):
                # apply given section parameters, then load from stdin
                config.set(args[0], 'filename', 'sys.stdin')
                sections.append(args[0])
            else:
                print >>sys.stderr, \
                      "Error: Please provide a [%s] section" % args[0]
                sys.exit(5)
        else:
            print >>sys.stderr, \
                  "Error: can't read several sections all from stdin"
            sys.exit(5)

    elif len(args) > 0:
        for s in args:
            if config.has_section(s):
                sections.append(s)
            else:
                log.info("Creating a section for file '%s'" % s)

                # a filename was given, apply [pgsql] defaults
                # set the tablename as the filename sans extension
                # consider columns = *
                if not os.path.exists(s):
                    print >>sys.stderr, \
                          "Error: '%s' does not exist as a section nor as a file" % s
                    sys.exit(2)

                config.add_section(s)
                config.set(s, 'table',
                           os.path.splitext(os.path.basename(s))[0])
                config.set(s, 'filename', s)
                config.set(s, 'columns', '*')
                config.set(s, 'format', 'csv')
                sections.append(s)

    else:
        if not LOAD_FROM_STDIN:
            # don't load all sections first when asked to load stdin
            log.debug("No argument on CLI, will consider all sections")

            for s in config.sections():
                if s != 'pgsql':
                    sections.append(s)

            # we run through sorted section list, unless we got the
            # section list from command line
            sections.sort()

    if FILE_BOUNDARIES is not None and len(sections) > 1:
        print >>sys.stderr, \
              "Error: will not apply boundaries on more than one file"
        sys.exit(5)

    log.info('Will consider following sections:')
    for line in myprint(sections):
        log.info(line)

    # we count time passed from now on
    begin = time.time()

    threads  = {}
    started  = {}
    finished = {}
    current  = 0

    max_running = MAX_PARALLEL_SECTIONS
    if max_running == -1:
        max_running = len(sections)

    log.info('Will load %d section(s) at a time' % max_running)
    sem = threading.BoundedSemaphore(max_running)

    while current < len(sections):
        s = sections[current]

        try:
            loader = None
            summary[s]  = []
            started[s]  = threading.Event()
            finished[s] = threading.Event()

            try:
                loader = PGLoader(s, config, sem,
                                  (started[s], finished[s]), summary[s])
            except PGLoader_Error, e:
                # could not properly initialize this loader, don't
                # ever wait for it
                started[s].set()
                finished[s].set()
                log.error(e)

                if DEBUG:
                    raise

            except IOError, e:
                # No space left on device? can't log it
                break

            if loader:
                if not loader.template:
                    if FILE_BOUNDARIES is not None and len(sections) == 1:
                        loader.reader.set_boundaries(FILE_BOUNDARIES)

                    filename       = loader.filename
                    input_encoding = loader.input_encoding
                    threads[s]     = loader

                    # .start() will sem.acquire(), so we won't have more
                    # than max_running threads running at any time.
                    log.debug("Starting a thread for %s" % s)
                    threads[s].start()
                else:
                    log.info("Skipping section %s, which is a template" % s)

                    for d in (summary, started, finished):
                        d.pop(s)
        except PGLoader_Error, e:
            if str(e) == '':
                log.error('[%s] Please correct previous errors' % s)
            else:
                log.error('%s' % e)

            if DEBUG:
                raise

            if PEDANTIC:
                # was: threads[s].print_stats()
                # but now threads[s] is no longer alive
                pass

        except UnicodeDecodeError, e:
            log.error("can't open '%s' with given input encoding '%s'" \
                      % (filename, input_encoding))