def main(args):
    """Command-line entry point: run strip_duplicate on INFILE -> OUTFILE.

    With no filenames, input is read from stdin and output goes to stdout.
    With one filename, that file is the input and output goes to stdout.
    With two filenames, the second is opened for binary writing as output.

    :param args: Argument strings handed to optparse for parsing.
    :return: -1 if more than two filenames are given; otherwise None.
    """
    import optparse
    p = optparse.OptionParser('%prog [INFILE [OUTFILE]]')
    _, args = p.parse_args(args)
    if len(args) > 2:
        sys.stderr.write('We only support 2 filenames, not %d\n'
                         % (len(args),))
        return -1
    # Callables to run on the way out, whether or not processing succeeded.
    cleanups = []
    try:
        if len(args) == 0:
            infile = sys.stdin
            insize = None
            outfile = sys.stdout
        else:
            infile, cleanup = files.open_file(args[0])
            if cleanup is not None:
                cleanups.append(cleanup)
            if isinstance(infile, file):
                # pipes are files, but 0 isn't useful.
                insize = os.fstat(infile.fileno()).st_size or None
            else:
                insize = None
            if len(args) == 1:
                outfile = sys.stdout
            else:
                outfile = open(args[1], 'wb')
                # We opened this file ourselves, so we must close it; doing
                # so via cleanups also flushes it if strip_duplicate raises.
                cleanups.append(outfile.close)
        strip_duplicate(infile, outfile, insize)
    finally:
        for cleanup in cleanups:
            cleanup()
def main(args):
    """Parse the command line and run strip_duplicate on the chosen streams.

    Usage is ``[INFILE [OUTFILE]]``: a missing INFILE means stdin, a missing
    OUTFILE means stdout. A named OUTFILE is opened in 'wb' mode.

    :param args: Argument strings to be parsed by optparse.
    :return: -1 when more than two filenames are supplied; None otherwise.
    """
    import optparse
    parser = optparse.OptionParser('%prog [INFILE [OUTFILE]]')
    _, args = parser.parse_args(args)
    if len(args) > 2:
        sys.stderr.write('We only support 2 filenames, not %d\n'
                         % (len(args), ))
        return -1
    # Everything registered here is invoked in the finally clause below.
    cleanups = []
    try:
        if len(args) == 0:
            infile = sys.stdin
            insize = None
            outfile = sys.stdout
        else:
            infile, cleanup = files.open_file(args[0])
            if cleanup is not None:
                cleanups.append(cleanup)
            if isinstance(infile, file):
                # pipes are files, but 0 isn't useful.
                insize = os.fstat(infile.fileno()).st_size or None
            else:
                insize = None
            if len(args) == 1:
                outfile = sys.stdout
            else:
                outfile = open(args[1], 'wb')
                # Previously this handle was never closed; register the
                # close so the output is flushed and released on every path.
                cleanups.append(outfile.close)
        strip_duplicate(infile, outfile, insize)
    finally:
        for cleanup in cleanups:
            cleanup()
def load(source, using_json=None, show_prog=True, collapse=True,
         max_parents=None):
    """Load objects from the given source and return the resulting manager.

    :param source: A string is treated as a filename: it is opened and all
        objects are read from it. Anything else is simply iterated and
        parsed, so it should be an iterator of json lines.
    :param using_json: Choose simplejson instead of the regex parser.
        simplejson handles arbitrarily ordered json dicts, though one
        object per line is still required. Pass False for the regex, True
        to force simplejson. None probes for simplejson and uses it when
        available. (With _speedups built, simplejson parses faster and more
        accurately than the regex.)
    :param show_prog: If True, display the progress as we read in data.
    :param collapse: If True, run collapse_instance_dicts() after loading.
    :param max_parents: See ObjManager.__init__(max_parents)
    """
    closer = None
    # Size stays 0 unless we can measure the input below.
    input_size = 0
    if isinstance(source, str):
        source, closer = files.open_file(source)
        if isinstance(source, file):
            input_size = os.fstat(source.fileno()).st_size
    elif isinstance(source, (list, tuple)):
        input_size = sum(len(line) for line in source)
    if using_json is None:
        using_json = simplejson is not None
    try:
        manager = _load(source, using_json, show_prog, input_size,
                        max_parents=max_parents)
    finally:
        # Only sources we opened ourselves may come with a cleanup callable.
        if closer is not None:
            closer()
    if not collapse:
        return manager
    started = time.time()
    if not manager.collapse_instance_dicts():
        manager.compute_parents()
    if show_prog:
        elapsed = time.time() - started
        sys.stderr.write('collapsed in %.1fs\n' % (elapsed,))
    return manager
def load(source, using_json=None, show_prog=True, collapse=True,
         max_parents=None):
    """Read objects out of ``source`` and hand back the populated manager.

    :param source: When this is a string it names a file, which is opened
        and read in full. Any other value is iterated directly and must
        yield json lines.
    :param using_json: Selects the parser. False means the regex, True
        forces simplejson, and None uses simplejson if it is available.
        simplejson copes with arbitrarily ordered json dicts but still
        needs per-line layout. (With _speedups built, simplejson parses
        faster and more accurately than the regex.)
    :param show_prog: If True, display the progress as we read in data.
    :param collapse: If True, run collapse_instance_dicts() after loading.
    :param max_parents: See ObjManager.__init__(max_parents)
    """
    release = None
    if isinstance(source, str):
        source, release = files.open_file(source)
        is_real_file = isinstance(source, file)
        input_size = os.fstat(source.fileno()).st_size if is_real_file else 0
    elif isinstance(source, (list, tuple)):
        input_size = sum(map(len, source))
    else:
        input_size = 0
    if using_json is None:
        using_json = simplejson is not None
    try:
        manager = _load(source, using_json, show_prog, input_size,
                        max_parents=max_parents)
    finally:
        # release is set only when we opened the source ourselves.
        if release is not None:
            release()
    if collapse:
        t0 = time.time()
        collapsed = manager.collapse_instance_dicts()
        if not collapsed:
            manager.compute_parents()
        if show_prog:
            t1 = time.time()
            sys.stderr.write('collapsed in %.1fs\n' % (t1 - t0,))
    return manager
def source():
    """Yield each object that loader.iter_objs produces from args[0].

    The file named by ``args[0]`` (a name taken from the surrounding scope)
    is opened with files.open_file. Any cleanup callable it returns is run
    when iteration ends, including when the consumer stops early or parsing
    raises.
    """
    infile, cleanup = files.open_file(args[0])
    try:
        for obj in loader.iter_objs(infile):
            yield obj
    finally:
        # Other users of files.open_file treat the cleanup as possibly None,
        # so guard the call rather than invoking it unconditionally.
        if cleanup is not None:
            cleanup()