def main(argv):
    """CLI entry point: compute per-dimension counter statistics over a data
    stream and write them to counter-statistics.csv.

    argv: command-line arguments (excluding the program name).
    Returns 0 on success (process exit code).
    """
    # NOTE(review): prog name looks copy-pasted from the hidden-states script
    # (this one writes counter statistics) - confirm intended name.
    ap = ArgumentParser(prog="generate-hidden-states")
    ap.add_argument("-v", "--verbose", default=False, action="store_true",
                    help="Turn on verbose logging.")
    ap.add_argument("--report", default=False, action="store_true")
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    ap.add_argument("kind", choices=["train", "validation", "test"])
    ap.add_argument("dimensions", nargs="+", type=int)
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0],
                  aargs.verbose, False, True, True)
    logging.debug(aargs)
    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir)
    averages = categorize_rates(lstm, data.stream_data(aargs.data_dir, aargs.kind),
                                aargs.dimensions, aargs.report)
    # Flatten the nested {stat: {dimension: points}} mapping into csv rows,
    # preceded by a header row.
    rows = [("", "0", "1")]
    for stat, dimension_points in averages.items():
        for dimension, points in dimension_points.items():
            rows.append(("%s-%s" % (stat, dimension), *points))
    # newline="" is required by the csv module; without it the writer emits
    # blank lines between rows on Windows.
    with open("counter-statistics.csv", "w", newline="") as fh:
        writer = csv_writer(fh)
        writer.writerows(rows)
    return 0
def main(argv):
    """CLI entry point: run (or dry-run) hidden-state elicitation for a trained
    sequential model over one split of the data.

    argv: command-line arguments (excluding the program name).
    Returns 0 on success (process exit code).
    """
    ap = ArgumentParser(prog="generate-hidden-states")
    ap.add_argument("-v", "--verbose", default=False, action="store_true",
                    help="Turn on verbose logging.")
    ap.add_argument("-s", "--sample-rate", type=float, default=0.1,
                    help="train then test sampling rates.")
    ap.add_argument("-d", "--dry-run", default=False, action="store_true")
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    ap.add_argument("states_dir")
    ap.add_argument("kind", choices=["train", "validation", "test"])
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0],
                  aargs.verbose, False, True, True)
    logging.debug(aargs)
    if aargs.dry_run:
        # Dry runs only inspect the sampled stream; no model is loaded.
        dry_run(data.stream_data(aargs.data_dir, aargs.kind), aargs.sample_rate,
                aargs.kind)
        return 0
    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir)
    description = data.get_description(aargs.data_dir)
    # PEP 8 (E731): use def rather than assigning a lambda to a name.
    if description.task == data.LM:
        def annotation_fn(y, i):
            # Language-modelling task: annotate with the token at position i.
            return y[i][0]
    else:
        def annotation_fn(y, i):
            # Other tasks: the whole y is the annotation, independent of i.
            return y
    elicit_hidden_states(lstm, data.stream_data(aargs.data_dir, aargs.kind),
                         annotation_fn, aargs.sample_rate, aargs.states_dir,
                         aargs.kind)
    return 0
def main(argv):
    """CLI entry point: count data instances whose token sequence matches WORDS
    under the chosen match mode, stopping at --limit unless unlimited.

    argv: command-line arguments (excluding the program name).
    Returns 0 on success (process exit code).
    Raises ValueError if --match relative is used without exactly two words.
    """
    ap = ArgumentParser(prog="query-data")
    ap.add_argument("-v", "--verbose", default=False, action="store_true",
                    help="Turn on verbose logging.")
    ap.add_argument("--limit", type=int, default=10,
                    help="Truncate the results at maximum LIMIT. Negative indicates to find all (unlimited).")
    ap.add_argument("--match", choices=["include", "sequence", "relative"],
                    default="include")
    ap.add_argument("data_dir")
    ap.add_argument("kind", choices=["train", "test"])
    ap.add_argument("words", nargs="*", default=None)
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0],
                  aargs.verbose, False, True, True)
    logging.debug(aargs)
    if aargs.match == "relative":
        # Quickest way to implement relative is just to make it correct for N = 2.
        # Explicit validation instead of assert: asserts are stripped under
        # python -O, which would silently skip this input check.
        if len(aargs.words) != 2:
            raise ValueError("--match relative requires exactly 2 words, got %d"
                             % len(aargs.words))
    truncated = False
    count = 0
    for xy in data.stream_data(aargs.data_dir, aargs.kind):
        # TODO: work for non-lm cases.
        sequence = [item[0] for item in xy.x] + [xy.y[-1][0]]
        if matches(sequence, aargs.words, aargs.match):
            count += 1
            logging.debug("Instance: %s" % " ".join(sequence))
            # Negative/zero limit means unlimited.
            if aargs.limit > 0 and count >= aargs.limit:
                logging.debug("Truncating..")
                truncated = True
                break
    user_log.info("Found %d%s instances." % (count, " (truncated)" if truncated else ""))
    return 0
def main(argv):
    """CLI entry point: elicit activation states from a trained sequential
    model over one split of the data and write them to activations_dir.

    argv: command-line arguments (excluding the program name).
    Returns 0 on success (process exit code).
    """
    ap = ArgumentParser(prog="generate-activation-states")
    ap.add_argument("-v", "--verbose", default=False, action="store_true",
                    help="Turn on verbose logging.")
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    ap.add_argument("activations_dir")
    ap.add_argument("kind", choices=["train", "validation", "test"])
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0],
                  aargs.verbose, False, True, True)
    logging.debug(aargs)
    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir)
    # Removed the unused local `description = data.get_description(...)` - its
    # result was never read. (Presumed side-effect free metadata read; confirm
    # get_description does no required setup before deploying.)
    elicit_activation_states(lstm, data.stream_data(aargs.data_dir, aargs.kind),
                             aargs.activations_dir)
    return 0
def measure(lstm, data_dir, kind, keys):
    """Measure stepwise state-change distances for the given instrument keys.

    Streams every (x, y) instance of `kind` from `data_dir` through a fresh
    stepwise run of `lstm`, and for each key in `keys` records the distance
    each step moves the instrumented state (first step: distance from the
    origin via geometry.hypotenuse; later steps: distance from the previous
    state via geometry.distance).

    Returns a 3-tuple:
      - global_minimum: {key: (distance, step_index, sequence)} for the
        smallest per-sequence minimum step distance seen so far.
      - global_maximum: {key: (distance, step_index, sequence)} analogously
        for the largest per-sequence maximum.
      - sequence_changes: {sequence: {key: [distance, ...]}} - only populated
        for sequences that set a new global extreme ("notable" sequences).

    NOTE(review): if an instance has an empty xy.x, minimum[key][0] stays
    None and the `None < ...` comparison below would raise TypeError on
    Python 3 - presumably the data stream never yields empty x; confirm.
    """
    sequence_changes = {}
    # (distance, step_index, sequence) triples; all-None means "unset".
    global_minimum = {key: (None, None, None) for key in keys}
    global_maximum = {key: (None, None, None) for key in keys}
    for j, xy in enumerate(data.stream_data(data_dir, kind)):
        if j % 100 == 0:
            # Progress heartbeat for long streams.
            logging.debug("At the %d instance." % (j))
        # Token sequence: every x token plus the final y token
        # (assumes a language-modelling layout of xy - TODO confirm).
        sequence = tuple([item[0] for item in xy.x]) + (xy.y[-1][0], )
        # Fresh stepwise run per instance so state does not leak across them.
        stepwise_rnn = lstm.stepwise(handle_unknown=True)
        change_distances = {key: [] for key in keys}
        previous_states = {}
        # Per-instance extremes as (distance, step_index); all-None = unset.
        minimum = {key: (None, None) for key in keys}
        maximum = {key: (None, None) for key in keys}
        for i, word_pos in enumerate(xy.x):
            # `result` (the step's prediction) is unused; only the
            # instrumented internal states are inspected here.
            result, instruments = stepwise_rnn.step(word_pos[0], rnn.LSTM_INSTRUMENTS)
            for part, layer in lstm.part_layers():
                key = lstm.encode_key(part, layer)
                if key in keys:
                    current_state = instruments[part][layer]
                    if key in previous_states:
                        distance = geometry.distance(previous_states[key], current_state)
                    else:
                        # First step: measure from the origin instead.
                        distance = geometry.hypotenuse(current_state)
                    change_distances[key] += [distance]
                    previous_states[key] = current_state
                    if minimum[key] == (None, None) or distance < minimum[key][0]:
                        minimum[key] = (distance, i)
                    if maximum[key] == (None, None) or distance > maximum[key][0]:
                        maximum[key] = (distance, i)
        # Fold this instance's extremes into the global extremes.
        for key in keys:
            if global_minimum[key] == (None, None, None) or minimum[key][0] < global_minimum[key][0]:
                global_minimum[key] = minimum[key] + (sequence, )
                # Only keeping track of the more notable sequence changes
                sequence_changes[sequence] = change_distances
                sequence_str, changes_str = stringify(sequence, sequence_changes[sequence][key])
                logging.debug("Noting minimum for %s of %.4f @%d:\n %s\n %s" % (key, minimum[key][0], minimum[key][1], sequence_str, changes_str))
            if global_maximum[key] == (None, None, None) or maximum[key][0] > global_maximum[key][0]:
                global_maximum[key] = maximum[key] + (sequence, )
                # Only keeping track of the more notable sequence changes
                sequence_changes[sequence] = change_distances
                sequence_str, changes_str = stringify(sequence, sequence_changes[sequence][key])
                logging.debug("Noting maximum for %s of %.4f @%d:\n %s\n %s" % (key, maximum[key][0], maximum[key][1], sequence_str, changes_str))
    return global_minimum, global_maximum, sequence_changes