import logging
import os
import sys
import time
from typing import TextIO

import config  # assumed: project-local configuration module
from checkpoint import Checkpoint  # assumed: project-local Checkpoint class


class Processor(object):
    """Class for processing dump files from postgresql."""

    MEGABYTE = 1024 * 1024

    def __init__(self):
        self.bytes_count = 0
        self.start_time = 0.0
        self.out_files = {}
        self.checkpoint = Checkpoint(config.VALUE_SET)
        self.init_time()

    def init_time(self):
        """Reset the timer to the current time."""
        self.start_time = time.time()

    def add_bytes_count(self, count: int):
        """Add `count` to the running total of processed bytes."""
        self.bytes_count += count

    def split_if_necessary(self) -> None:
        """Check the size of each output file; called after each batch.

        If a file exceeds FILE_SPLIT_SIZE (in bytes), close it and open a
        new file to continue storing records.
        """
        for v in config.VALUE_SET:
            file_size = self.out_files[v].tell()
            if file_size >= config.FILE_SPLIT_SIZE:
                self.checkpoint.update_file_index(v)
                new_file = open(
                    self.checkpoint.get_file_name(v, config.OUT_DIR), 'a')
                self.add_table_head(new_file)
                self.out_files[v].close()
                self.out_files[v] = new_file
                logging.info('File size grows over {:.2f} MB, '
                             'store in new file `{}`...'.format(
                                 config.FILE_SPLIT_SIZE / self.MEGABYTE,
                                 new_file.name))

    def process_line(self, line: str) -> None:
        """Process a single line.

        Does NOT verify the validity of lines beforehand: invalid ones are
        logged and ignored without terminating. Checks whether this line has
        already been recorded, and records it if not.

        :param line: str, line to process ('\n' not included)
        """
        attributes = line.split('\t')
        try:
            # Check whether the group-by value is one we keep
            value = attributes[config.GROUP_BY_ATTR_INDEX]
            if value not in config.VALUE_SET:
                return
            row_count = int(attributes[config.INDEX_ROW_COUNT])
            # Skip lines that were already parsed and recorded
            if row_count <= self.checkpoint.row_count[value]:
                return
            # Keep only the attributes we're interested in
            data = [attributes[i] for i in config.RECORD_ATTR_INDEX_LIST]
            # Write to the file associated with this value
            self.out_files[value].write('\t'.join(data))
            self.out_files[value].write('\n')
            # Update the per-value row index
            self.checkpoint.row_count[value] = row_count
        except Exception as e:
            logging.warning(e)
            logging.warning('Invalid row: {}'.format(attributes))

    @staticmethod
    def verify_file_schema(fp: TextIO) -> bool:
        """Verify the schema of the data contained in a file.

        The dump files of postgresql should contain exactly one table each.
        """
        line = fp.readline()
        # Remember to rewind to the head of the file
        fp.seek(0)
        if isinstance(line, bytes):
            line = str(line, encoding='utf-8')
        # Remove empty cells
        attributes = list(filter(None, line.split('\t')))
        # Check attribute count
        if len(attributes) != config.ATTR_COUNT:
            return False
        # Check that the row-count attribute is a valid integer
        try:
            _ = int(attributes[config.INDEX_ROW_COUNT])
        except ValueError:
            return False
        return True

    @staticmethod
    def add_table_head(f: TextIO) -> None:
        """Write the table headings."""
        f.write('\t'.join(config.RECORD_ATTR_LIST))
        f.write('\n')

    def process_file(self, filename: str, is_old_file: bool = False) -> None:
        """Process a text file (ending with '.dat') or gzip file (ending
        with '.gz').

        :param filename: str, name of file to process
        :param is_old_file: bool, whether this file has been processed
            before; if it has, batches already read are skipped.
        :return: None; unsupported or mismatched files are skipped
        """
        # Check the file type by extension
        file_type = filename[filename.rfind('.'):]
        if file_type not in config.OPEN_FUNCS:
            logging.info('Failed to process `{}`: unsupported file '
                         'type.'.format(filename))
            return
        # Open the file according to its type
        fp = config.OPEN_FUNCS[file_type](filename)
        # Old file: recover to the point where processing stopped last time
        if is_old_file and self.checkpoint.offset > 0:
            fp.seek(self.checkpoint.offset)
            logging.info('Time for seeking file offset: {:.2f} s'.format(
                time.time() - self.start_time))
            # This marks the real start of processing
            self.init_time()
        else:
            # New file: verify that it contains the table we want
            if not self.verify_file_schema(fp):
                logging.info(
                    'Schema of `{}` doesn\'t fit; skip.'.format(filename))
                fp.close()
                return
        # Record the file currently being processed
        self.checkpoint.current_file = filename
        logging.info('Start processing `{}`...'.format(filename))
        while True:
            # Remember where this batch starts so processing can resume here
            self.checkpoint.offset = fp.tell()
            batch = fp.read(config.BATCH_SIZE)
            # Read one more line so the batch ends on a line boundary
            line = fp.readline()
            if line:
                batch += line
            # EOF
            if not batch:
                break
            # Convert from bytes to str if needed (gzip files are binary)
            if isinstance(batch, bytes):
                batch = str(batch, 'utf-8')
            # Parse the batch line by line
            for line in batch.splitlines():
                self.process_line(line)
            self.add_bytes_count(len(batch))
            # Split large files and redirect storage to the new files
            if config.SPLIT:
                self.split_if_necessary()
        fp.close()

    def process_dir(self, dirname: str) -> None:
        """Recursively process files in the given directory.

        :param dirname: str, directory of files to process
        """
        file_list = sorted(os.listdir(dirname))
        for name in file_list:
            # Full path of the file
            name = os.path.join(dirname, name)
            # Skip files that are already processed
            if name in self.checkpoint.processed_files:
                continue
            if os.path.isfile(name):
                self.process_file(name)
                self.checkpoint.processed_files.add(name)
            elif os.path.isdir(name) and config.RECURSIVE:
                self.process_dir(name)

    def before_process(self) -> None:
        """Create the output directory if needed, and load records."""
        if not os.path.isdir(config.OUT_DIR):
            os.mkdir(config.OUT_DIR)
        # Load checkpoints from file
        if os.path.exists(config.RECORD_FILE):
            self.checkpoint.load(config.RECORD_FILE)
            logging.info('Checkpoint loaded from `{}`.'.format(
                config.RECORD_FILE))
        # Open output files for appending
        for v in config.VALUE_SET:
            f = open(self.checkpoint.get_file_name(v, config.OUT_DIR), 'a')
            # If it's a new (empty) file, add headings
            if f.tell() == 0:
                self.add_table_head(f)
            self.out_files[v] = f

    def process(self, dir_list: list) -> None:
        """Process a list of directories / files."""
        try:
            # Prepare for processing
            self.before_process()
            # Recover from the file that was being processed last time
            if os.path.exists(self.checkpoint.current_file):
                logging.info('Reloading `{}` from last checkpoints...'.format(
                    self.checkpoint.current_file))
                self.process_file(self.checkpoint.current_file,
                                  is_old_file=True)
            if len(dir_list) == 0:
                logging.error(
                    'Please specify at least one directory or file to process.'
                )
            # Process each directory / file
            for dir_name in dir_list:
                if os.path.isdir(dir_name):
                    self.process_dir(dir_name)
                elif os.path.isfile(dir_name):
                    self.process_file(dir_name)
                else:
                    logging.warning(
                        '`{}` is not a directory / file; skip.'.format(
                            dir_name))
        # Stopped manually with Ctrl + C
        except KeyboardInterrupt:
            self.after_process(is_interrupted=True)
        # Other unknown exceptions...
        except Exception as e:
            logging.warning(e)
            self.after_process(is_interrupted=True)
        else:
            self.after_process(is_interrupted=False)

    def after_process(self, is_interrupted: bool) -> None:
        """Close opened files, remove useless files and save records."""
        # Close files, and remove files with (almost) no content
        head_len = len('\t'.join(config.RECORD_ATTR_LIST)) + 1
        for file in self.out_files.values():
            file.close()
            # Not a strict comparison: allow a small margin over the header
            if os.path.getsize(file.name) <= head_len + 100:
                os.remove(file.name)
        # Interrupted: save a checkpoint so processing can resume later
        if is_interrupted:
            self.checkpoint.save(config.RECORD_FILE)
            logging.info('Checkpoint saved in `{}`.'.format(
                config.RECORD_FILE))
        # Normal ending: remove the record file
        elif os.path.exists(config.RECORD_FILE):
            os.remove(config.RECORD_FILE)
        # Analyse speed
        total_mb = self.bytes_count / self.MEGABYTE
        total_time = time.time() - self.start_time
        avg_speed = total_mb / total_time
        logging.info(
            'Processed {:.2f} MB in {:.2f} s, {:.2f} MB/s on average.'.format(
                total_mb, total_time, avg_speed))
        sys.exit(int(is_interrupted))
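

# ---------------------------------------------------------------------------
# Usage sketch (an assumption, not part of the original module): one way to
# drive Processor from the command line. It presumes that `config` exposes
# the attributes referenced above, e.g. OPEN_FUNCS mapping extensions to
# open functions such as {'.dat': open, '.gz': gzip.open}, and that
# Checkpoint implements the interface used by this class; adjust the names
# to your project.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Group postgresql dump rows by attribute value.')
    parser.add_argument('paths', nargs='*',
                        help='directories or files to process')
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    # Processing resumes automatically from RECORD_FILE if a previous run
    # was interrupted; the exit code is 1 when interrupted, 0 otherwise.
    Processor().process(args.paths)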