def sort(path, threshold, tmpdir=None):
    """Yield the records of the file at *path* in sorted order.

    When the file size reported by ``os.stat`` is at most *threshold*, all
    records are loaded and sorted in memory.  Otherwise the records are cut
    into chunks by ``_slice_tsfile``, each chunk is sorted and written to its
    own temporary file, and the temporary files are then merged.

    :param path: path of the file to sort.
    :param threshold: size limit compared against the file size in bytes; it
        is also handed to ``_slice_tsfile`` to size the chunks.
    :param tmpdir: directory for the temporary chunk files, passed to
        ``mkstemp`` as ``dir`` (``None`` lets ``mkstemp`` pick its default).
    :returns: a generator of ``(msec_to_timestamp(int(ts)), data)`` pairs.

    The temporary files are closed and removed when the generator finishes,
    is closed early by the caller, or fails with an exception.
    """
    # Small input: sort everything in memory, no temporary files needed.
    if os.stat(path).st_size <= threshold:
        for ts, data in sorted(_read_raw_file(path)):
            yield msec_to_timestamp(int(ts)), data
        return

    tmpfiles = []
    try:
        # Split and sort: one sorted temporary file per chunk.
        for chunk in _slice_tsfile(_read_raw_file(path), threshold):
            chunk.sort()

            raw_fd, fdpath = mkstemp(prefix=SORT_FILE_PREFIX, dir=tmpdir)
            try:
                fd = os.fdopen(raw_fd, 'w+t')
            except Exception:
                # The descriptor is not yet owned by a file object, so it has
                # to be released by hand before re-raising.
                os.close(raw_fd)
                os.unlink(fdpath)
                raise

            # Register before writing so that a failed write is still
            # cleaned up by the ``finally`` clause below.
            tmpfiles.append((fd, fdpath))
            for line in chunk:
                fd.write('%s %s\n' % line)

        # Merge the sorted temporary files into a single sorted stream.
        # NOTE(review): ``merge`` is presumably ``heapq.merge`` -- confirm
        # against the file's imports.
        readers = [_read_raw_fd(fd) for fd, _ in tmpfiles]
        for ts, data in merge(*readers):
            yield msec_to_timestamp(int(ts)), data
    finally:
        # Runs on normal exhaustion, on generator close() and on errors, so
        # no temporary file outlives the generator.
        for fd, fdpath in tmpfiles:
            try:
                fd.close()
            finally:
                os.unlink(fdpath)
def _read_file(path):
    """Yield ``(timestamp, data)`` pairs parsed from the file at *path*.

    The file is opened with ``GzipFile``, ``BZ2File`` and the builtin ``open``
    in that order.  An ``IOError`` raised while reading makes the function
    move on to the next opener; the first opener that is read through to the
    end terminates the generator.

    Each line is stripped and split on its first space into a millisecond
    value and the remaining data; the value is converted with
    ``msec_to_timestamp(int(...))`` before being yielded.
    """
    # NOTE(review): on Python 3 the compressed readers iterate over bytes, so
    # the ``split(' ', 1)`` below would need a text layer -- confirm the
    # Python version this module targets.
    openers = (GzipFile, BZ2File, open)
    for opener in openers:
        handle = opener(path)
        try:
            for raw_line in handle:
                stamp, payload = raw_line.strip().split(' ', 1)
                yield msec_to_timestamp(int(stamp)), payload
        except IOError:
            # Reading failed with this opener; fall through to the next one.
            pass
        else:
            # Whole file consumed successfully: no further opener is tried.
            return
        finally:
            handle.close()