def _write_set(self, sset, apps_sizes):
    """Write one CSV row per (app, size) pair belonging to set *sset*."""
    for app, size in apps_sizes:
        stats = StageStats.get_stats(app.stages)
        # Spark reports wrong bytes-read metrics for this stage, so the
        # known input size is written over the parsed value on purpose.
        stats[1] = size
        row = [len(app.slaves), sset, size, app.records_read,
               app.duration, Parser.fits_in_memory(app)]
        self._writer.writerow(row + stats)
def get_row(log):
    """Return a row using only one LogParser instance."""
    app = LogParser().parse_file(log)
    row = [
        len(app.slaves),
        HBKmeansParser.get_set(app.records_read),
        app.bytes_read,
        app.records_read,
        app.duration,
        Parser.fits_in_memory(app),
    ]
    return row + StageStats.get_stats(app.stages)
def run(self):
    """Parse logs and extract relevant information."""
    self.start()
    # CSV output: header columns are derived from a sample app's stages.
    csv_gen = CSVGen()
    sample = HBSortParser.get_app()
    header = (['workers', 'set', 'input_bytes', 'input_records',
               'duration_ms', 'in_memory']
              + StageStats.get_titles(sample.stages))
    writer = csv_gen.get_writer(header, self.filename)
    for app in HBSortParser.get_apps():
        bytes_read = app.bytes_read
        writer.writerow(
            [len(app.slaves), HBSortParser.get_set(bytes_read), bytes_read,
             app.records_read, app.duration, Parser.fits_in_memory(app)]
            + StageStats.get_stats(app.stages))
    csv_gen.close()
    self.finish()