def load_and_preprocess_data(config):
    logger.info("Loading training data...")
    train_emails = read_records(config.train_filepath)
    logger.info("Done. Read %d emails", len(train_emails))

    logger.info("Loading dev data...")
    dev_emails = read_records(config.dev_filepath)
    logger.info("Done. Read %d emails", len(dev_emails))

    logger.info("Loading test data...")
    test_emails = read_records(config.test_filepath)
    logger.info("Done. Read %d emails", len(test_emails))

    return train_emails, dev_emails, test_emails
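
# read_records itself is defined elsewhere and is not shown here. A minimal
# sketch of what such a loader might look like, assuming each record is one
# JSON object per line (an assumption, not the project's actual format; the
# name and signature below are hypothetical):
import json

def read_records(filepath):
    # Hypothetical loader: one JSON object (one email) per line.
    records = []
    with open(filepath) as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records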
def snap_records(combined_seg, segments_index, infile, record_type,
                 startyear=None, endyear=None):
    records = util.read_records(infile, record_type, startyear, endyear)
    if record_type == 'concern' and not records:
        print("no concerns found")
        return

    # Find nearest crashes - 30 tolerance
    print("snapping " + record_type + " records to segments")
    util.find_nearest(records, combined_seg, segments_index, 30,
                      type_record=True)

    # Write out snapped records
    schema = records[0].schema
    shpfile = os.path.join(MAP_FP, record_type + '_joined.shp')
    util.records_to_shapefile(schema, shpfile, records)

    jsonfile = os.path.join(PROCESSED_DATA_FP, record_type + '_joined.json')
    print("output " + record_type + " data to " + jsonfile)
    with open(jsonfile, 'w') as f:
        json.dump([r.properties for r in records], f)
def plot_CDF(filename, start_selector, end_selector,
             start_index=None, end_index=None, **kwargs):
    records = read_records(filename)
    lineages = preprocess(records, cmd_of_interest='', send_only=False)
    intervals = get_intervals(lineages, start_selector, end_selector,
                              start_index=start_index, end_index=end_index,
                              **kwargs)

    # Empirical CDF: sort the interval lengths and plot cumulative probability
    sortedtime = np.sort(list(intervals.values()))
    p = 1. * np.arange(len(intervals)) / (len(intervals) - 1)
    plt.plot(sortedtime, p, **kwargs)
def running_time(filename):
    records = read_records(filename)
    total_time = 0.0
    running = 0
    last_ts = records[0]['ts']

    for r in records:
        if r['op'] == 'recv' and r['msg'] == 'lambda_start_ts':
            # A worker started: charge the elapsed interval to the workers
            # that were already running, then bump the count
            total_time += (r['ts'] - last_ts) * running
            running += 1
            last_ts = r['ts']
        elif r['op'] == 'send' and r['msg'] == 'quit:':
            # A worker quit: charge the elapsed interval, then drop the count
            total_time += (r['ts'] - last_ts) * running
            running -= 1
            last_ts = r['ts']

    # Every worker that started should also have quit
    assert running == 0
    return total_time
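
# Hedged usage sketch, not part of the original code: running_time() returns
# aggregate worker time (the number of running workers integrated over the
# log's timeline), so dividing by the wall-clock span of the log gives the
# average parallelism. The helper name below is hypothetical.
def average_parallelism(filename):
    records = read_records(filename)
    wall_clock = records[-1]['ts'] - records[0]['ts']
    return running_time(filename) / wall_clock if wall_clock > 0 else 0.0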
def plot_lineages(filename, start_selector, end_selector,
                  start_index=None, end_index=None, **kwargs):
    records = read_records(filename)
    lineages = preprocess(records, cmd_of_interest='', send_only=False)
    intervals = get_intervals(lineages, start_selector, end_selector,
                              start_index=start_index, end_index=end_index,
                              **kwargs)

    # Plot one interval per lineage, ordered by numeric lineage id
    items = sorted(intervals.items(), key=lambda i: int(i[0]))
    return plt.plot([i[0] for i in items], [i[1] for i in items], **kwargs)
def get_completion_time(filename):
    records = read_records(filename)
    lineages = preprocess(records, cmd_of_interest='', send_only=False)
    lineage_time = [l[-1]['ts'] for l in lineages.values()]
    return np.percentile(lineage_time, (95, 99, 100))
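
# Hedged usage sketch, not in the original file: report the 95th, 99th, and
# 100th percentile completion times returned above ('trace.log' is a
# placeholder path, and the helper name is hypothetical).
def report_completion_time(filename='trace.log'):
    p95, p99, p100 = get_completion_time(filename)
    print('lineage completion time: p95=%.2f p99=%.2f max=%.2f'
          % (p95, p99, p100))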
                color='k', linestyle='-', linewidth=2)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="plot stack graphs")
    parser.add_argument("logfile", help="pipeline log file", type=str)
    parser.add_argument("-v", "--verbose", help="show ts of all commands",
                        action="store_true")
    parser.add_argument("-s", "--sort", help="sort by completion time",
                        action="store_true")
    parser.add_argument("--chunklen", help="chunk length", type=float)
    parser.add_argument("--playtime", help="virtual playback start time",
                        type=float)
    args = parser.parse_args()

    lines = read_records(args.logfile)
    plot_stack(lines, chunk_length=args.chunklen, ystart=args.playtime,
               verbose=args.verbose, sort_by_completion_time=args.sort)
    plt.show()