Esempio n. 1
0
def main(argv):
    """Command-line entry point: compute rate averages and dump them as CSV.

    Loads the sequential model, runs ``categorize_rates`` over the chosen
    data split for the requested dimensions, and writes the result to
    ``counter-statistics.csv`` in the current working directory.

    The CSV starts with the header row ``("", "0", "1")``; every following
    row is labelled ``"<stat>-<dimension>"`` and is followed by that entry's
    points.

    Args:
        argv: Argument strings (without the program name), parsed by
            argparse.

    Returns:
        0 on success.
    """
    # NOTE(review): the prog name says "generate-hidden-states" although this
    # command writes counter statistics -- confirm whether that is intended.
    ap = ArgumentParser(prog="generate-hidden-states")
    ap.add_argument("-v",
                    "--verbose",
                    default=False,
                    action="store_true",
                    help="Turn on verbose logging.")
    ap.add_argument("--report", default=False, action="store_true")
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    ap.add_argument("kind", choices=["train", "validation", "test"])
    ap.add_argument("dimensions", nargs="+", type=int)
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0],
                  aargs.verbose, False, True, True)
    logging.debug(aargs)

    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir)
    averages = categorize_rates(lstm,
                                data.stream_data(aargs.data_dir, aargs.kind),
                                aargs.dimensions, aargs.report)
    rows = [("", "0", "1")]

    for stat, dimension_points in averages.items():
        for dimension, points in dimension_points.items():
            rows.append(("%s-%s" % (stat, dimension), *points))

    # newline="" lets the csv writer control line endings itself; without it
    # platforms that translate "\n" (e.g. Windows) emit a blank line after
    # every row.
    with open("counter-statistics.csv", "w", newline="") as fh:
        csv_writer(fh).writerows(rows)

    return 0
def main(argv):
    """Command-line entry point: elicit hidden states from a trained model.

    With ``--dry-run`` only ``dry_run`` is invoked on the streamed data and
    the model is never loaded.  Otherwise the sequential model and the data
    description are loaded and ``elicit_hidden_states`` is run over the
    chosen split, writing into ``states_dir``.

    Args:
        argv: Argument strings (without the program name), parsed by
            argparse.

    Returns:
        0 on success.
    """
    parser = ArgumentParser(prog="generate-hidden-states")
    parser.add_argument("-v", "--verbose",
                        action="store_true",
                        default=False,
                        help="Turn on verbose logging.")
    parser.add_argument("-s", "--sample-rate",
                        type=float,
                        default=0.1,
                        help="train then test sampling rates.")
    parser.add_argument("-d", "--dry-run", action="store_true", default=False)
    for positional in ("data_dir", "sequential_dir", "states_dir"):
        parser.add_argument(positional)
    parser.add_argument("kind", choices=["train", "validation", "test"])
    aargs = parser.parse_args(argv)

    # The log file is a dot-file named after this script (extension dropped).
    script_name, _ = os.path.splitext(os.path.basename(__file__))
    setup_logging(".%s.log" % script_name, aargs.verbose, False, True, True)
    logging.debug(aargs)

    if aargs.dry_run:
        instances = data.stream_data(aargs.data_dir, aargs.kind)
        dry_run(instances, aargs.sample_rate, aargs.kind)
        return 0

    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir)
    description = data.get_description(aargs.data_dir)

    # For the LM task the annotation is the first element of the i-th entry
    # of y; for every other task y itself is used as the annotation.
    annotation_fn = ((lambda y, i: y[i][0])
                     if description.task == data.LM
                     else (lambda y, i: y))

    instances = data.stream_data(aargs.data_dir, aargs.kind)
    elicit_hidden_states(lstm, instances, annotation_fn, aargs.sample_rate,
                         aargs.states_dir, aargs.kind)
    return 0
def main(argv):
    """Command-line entry point: count data instances matching given words.

    Streams the chosen split, rebuilds each instance's token sequence (the
    first element of every ``xy.x`` item plus the first element of the last
    ``xy.y`` item) and counts the sequences for which ``matches`` succeeds
    under the selected ``--match`` mode.  Counting stops once ``--limit``
    matches were found; a limit of zero or less disables truncation.

    Args:
        argv: Argument strings (without the program name), parsed by
            argparse.

    Returns:
        0 on success.

    Raises:
        SystemExit: With status 2 (via ``ArgumentParser.error``) when the
            arguments are invalid, including ``--match relative`` being
            used with anything other than exactly two words.
    """
    ap = ArgumentParser(prog="query-data")
    ap.add_argument("-v",
                    "--verbose",
                    default=False,
                    action="store_true",
                    help="Turn on verbose logging.")
    ap.add_argument(
        "--limit",
        type=int,
        default=10,
        help=
        "Truncate the results at maximum LIMIT.  Negative indicates to find all (unlimited)."
    )
    ap.add_argument("--match",
                    choices=["include", "sequence", "relative"],
                    default="include")
    ap.add_argument("data_dir")
    ap.add_argument("kind", choices=["train", "test"])
    ap.add_argument("words", nargs="*", default=None)
    aargs = ap.parse_args(argv)

    # Quickest way to implement relative is just to make it correct for N = 2.
    # Reported as a usage error rather than an assert so the check survives
    # `python -O` and the user gets a usage message instead of a traceback.
    if aargs.match == "relative" and len(aargs.words) != 2:
        ap.error("--match relative requires exactly 2 words (got %d)" %
                 len(aargs.words))

    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0],
                  aargs.verbose, False, True, True)
    logging.debug(aargs)

    truncated = False
    count = 0

    for xy in data.stream_data(aargs.data_dir, aargs.kind):
        # TODO: work for non-lm cases.
        sequence = [item[0] for item in xy.x] + [xy.y[-1][0]]

        if matches(sequence, aargs.words, aargs.match):
            count += 1
            logging.debug("Instance: %s", " ".join(sequence))

        if aargs.limit > 0 and count >= aargs.limit:
            logging.debug("Truncating..")
            truncated = True
            break

    user_log.info("Found %d%s instances." %
                  (count, " (truncated)" if truncated else ""))
    return 0
def main(argv):
    """Command-line entry point: elicit activation states from a trained model.

    Loads the sequential model, fetches the data description, and runs
    ``elicit_activation_states`` over the chosen split, writing into
    ``activations_dir``.

    Args:
        argv: Argument strings (without the program name), parsed by
            argparse.

    Returns:
        0 on success.
    """
    parser = ArgumentParser(prog="generate-activation-states")
    parser.add_argument("-v", "--verbose",
                        action="store_true",
                        default=False,
                        help="Turn on verbose logging.")
    #parser.add_argument("-d", "--dry-run", default=False, action="store_true")
    for positional in ("data_dir", "sequential_dir", "activations_dir"):
        parser.add_argument(positional)
    parser.add_argument("kind", choices=["train", "validation", "test"])
    aargs = parser.parse_args(argv)

    # The log file is a dot-file named after this script (extension dropped).
    script_name, _ = os.path.splitext(os.path.basename(__file__))
    setup_logging(".%s.log" % script_name, aargs.verbose, False, True, True)
    logging.debug(aargs)

    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir)
    # The description is not used below; the call is kept because it may
    # have side effects (e.g. failing early on a bad data_dir) -- TODO confirm.
    _description = data.get_description(aargs.data_dir)

    instances = data.stream_data(aargs.data_dir, aargs.kind)
    elicit_activation_states(lstm, instances, aargs.activations_dir)

    return 0
Esempio n. 5
0
def measure(lstm, data_dir, kind, keys):
    """Find the smallest and largest per-step state changes for each key.

    Every instance streamed from ``data_dir``/``kind`` is fed, one ``xy.x``
    item at a time, through a fresh stepwise run of ``lstm``.  For each
    (part, layer) whose encoded key is in ``keys`` the instrumented state is
    compared with the state from the previous step via
    ``geometry.distance``; on the first step, where there is no previous
    state, ``geometry.hypotenuse`` of the state itself is used instead.

    Args:
        lstm: Model providing ``stepwise``, ``part_layers`` and
            ``encode_key``.
        data_dir: Directory handed to ``data.stream_data``.
        kind: Data split handed to ``data.stream_data``.
        keys: Re-iterable collection of encoded keys to track.

    Returns:
        A tuple ``(global_minimum, global_maximum, sequence_changes)``:

        * ``global_minimum`` / ``global_maximum`` map each key to
          ``(distance, step_index, sequence)`` for the most extreme change
          seen, or ``(None, None, None)`` if no distance was ever recorded
          for that key.
        * ``sequence_changes`` maps a sequence tuple to
          ``{key: [distance, ...]}`` for the sequences that set a new global
          extreme at the time they were processed.
    """
    sequence_changes = {}
    global_minimum = {key: (None, None, None) for key in keys}
    global_maximum = {key: (None, None, None) for key in keys}

    for j, xy in enumerate(data.stream_data(data_dir, kind)):
        if j % 100 == 0:
            logging.debug("At the %d instance." % (j))

        sequence = tuple(item[0] for item in xy.x) + (xy.y[-1][0], )
        stepwise_rnn = lstm.stepwise(handle_unknown=True)
        change_distances = {key: [] for key in keys}
        previous_states = {}
        minimum = {key: (None, None) for key in keys}
        maximum = {key: (None, None) for key in keys}

        for i, word_pos in enumerate(xy.x):
            _, instruments = stepwise_rnn.step(word_pos[0],
                                               rnn.LSTM_INSTRUMENTS)

            for part, layer in lstm.part_layers():
                key = lstm.encode_key(part, layer)

                if key not in keys:
                    continue

                current_state = instruments[part][layer]

                if key in previous_states:
                    distance = geometry.distance(previous_states[key],
                                                 current_state)
                else:
                    # No previous state yet for this key in this sequence.
                    distance = geometry.hypotenuse(current_state)

                change_distances[key].append(distance)
                previous_states[key] = current_state

                if minimum[key][0] is None or distance < minimum[key][0]:
                    minimum[key] = (distance, i)

                if maximum[key][0] is None or distance > maximum[key][0]:
                    maximum[key] = (distance, i)

        for key in keys:
            # A key with no recorded distance (no matching part/layer, or an
            # empty xy.x) still holds (None, None).  Skip it: otherwise None
            # would be stored as a global extreme and "%.4f" % None below
            # would raise TypeError.  minimum and maximum are always set
            # together, so one check covers both.
            if minimum[key][0] is None:
                continue

            if (global_minimum[key][0] is None
                    or minimum[key][0] < global_minimum[key][0]):
                global_minimum[key] = minimum[key] + (sequence, )
                # Only keeping track of the more notable sequence changes
                sequence_changes[sequence] = change_distances
                sequence_str, changes_str = stringify(
                    sequence, change_distances[key])
                logging.debug(
                    "Noting minimum for %s of %.4f @%d:\n  %s\n  %s" %
                    (key, minimum[key][0], minimum[key][1], sequence_str,
                     changes_str))

            if (global_maximum[key][0] is None
                    or maximum[key][0] > global_maximum[key][0]):
                global_maximum[key] = maximum[key] + (sequence, )
                # Only keeping track of the more notable sequence changes
                sequence_changes[sequence] = change_distances
                sequence_str, changes_str = stringify(
                    sequence, change_distances[key])
                logging.debug(
                    "Noting maximum for %s of %.4f @%d:\n  %s\n  %s" %
                    (key, maximum[key][0], maximum[key][1], sequence_str,
                     changes_str))

    return global_minimum, global_maximum, sequence_changes