Code example #1
def main(argv):
    ap = ArgumentParser(prog="generate-hidden-states")
    ap.add_argument("-v",
                    "--verbose",
                    default=False,
                    action="store_true",
                    help="Turn on verbose logging.")
    ap.add_argument("--report", default=False, action="store_true")
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    ap.add_argument("kind", choices=["train", "validation", "test"])
    ap.add_argument("dimensions", nargs="+", type=int)
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0],
                  aargs.verbose, False, True, True)
    logging.debug(aargs)

    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir)
    averages = categorize_rates(lstm,
                                data.stream_data(aargs.data_dir, aargs.kind),
                                aargs.dimensions, aargs.report)
    rows = [("", "0", "1")]

    for stat, dimension_points in averages.items():
        for dimension, points in dimension_points.items():
            rows += [("%s-%s" % (stat, dimension), *points)]

    with open("counter-statistics.csv", "w") as fh:
        writer = csv_writer(fh)

        for row in rows:
            writer.writerow(row)

    return 0
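
The CSV loop above expects categorize_rates to return a nested mapping, statistic name -> dimension -> a pair of values matching the "0"/"1" columns of the header row. A minimal sketch of that shape, with invented statistic names and numbers:

# Invented values, purely to illustrate the shape assumed by the loop above.
averages = {
    "on-rate": {3: (0.12, 0.88), 7: (0.45, 0.55)},
    "off-rate": {3: (0.88, 0.12), 7: (0.55, 0.45)},
}

rows = [("", "0", "1")]

for stat, dimension_points in averages.items():
    for dimension, points in dimension_points.items():
        # e.g. ("on-rate-3", 0.12, 0.88)
        rows += [("%s-%s" % (stat, dimension), *points)]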
Code example #2
def main():
    ap = ArgumentParser(prog="pattern-query")
    ap.add_argument("-v",
                    "--verbose",
                    default=False,
                    action="store_true",
                    help="Turn on verbose logging.")
    ap.add_argument("--query-dir", default=None)
    ap.add_argument("--db-kind", choices=["postgres", "sqlite"])
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    ap.add_argument("predicate", nargs="+")
    aargs = ap.parse_args(sys.argv[1:])
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0],
                  aargs.verbose, False, True, True)
    logging.debug(aargs)

    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir)
    # The query directory and database kind are hard-coded here; the parsed
    # --query-dir and --db-kind arguments are not consulted.
    query_engine = domain.QueryEngine(lstm, "moot", "postgres")

    predicates = Predicates(predicate_strs=aargs.predicate)
    logging.debug("invoking query: %s" % (predicates.as_strs()))
    result = query_engine.find(0.1, predicates)
    dump = json.dumps(result.as_json())
    logging.debug("result: %s" % (dump))
    print(dump)
    return 0
Code example #3
def main(argv):
    ap = ArgumentParser(prog="measure-sequence-changes")
    ap.add_argument("-v",
                    "--verbose",
                    default=False,
                    action="store_true",
                    help="Turn on verbose logging.")
    ap.add_argument("data_dir")
    ap.add_argument("kind", choices=["train", "validation", "test"])
    ap.add_argument("sequential_dir")
    ap.add_argument("keys", nargs="+")
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0],
                  aargs.verbose, False, True, True)
    logging.debug(aargs)

    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir)
    minimum, maximum, sequence_changes = measure(lstm, aargs.data_dir,
                                                 aargs.kind, aargs.keys)

    for key in aargs.keys:
        distance, index, sequence = minimum[key]
        sequence_str, changes_str = stringify(sequence,
                                              sequence_changes[sequence][key])
        user_log.info("Global minimum for %s of %.4f @%d:\n  %s\n  %s" %
                      (key, distance, index, sequence_str, changes_str))
        distance, index, sequence = maximum[key]
        sequence_str, changes_str = stringify(sequence,
                                              sequence_changes[sequence][key])
        user_log.info("Global maximum for %s of %.4f @%d:\n  %s\n  %s" %
                      (key, distance, index, sequence_str, changes_str))

    return 0
Code example #4
def main(argv):
    ap = ArgumentParser(prog="generate-reduction-buckets")
    ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.")
    ap.add_argument("--grouping", nargs="*", default=None)
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    ap.add_argument("states_dir")
    ap.add_argument("buckets_dir")
    ap.add_argument("target", type=int)
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0],
                  aargs.verbose, False, True, True)
    logging.debug(aargs)

    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir, True)
    part_learned_mse = {}
    part_fixed_mse = {}

    if aargs.grouping is None:
        for key in lstm.keys():
            learned_mse, fixed_mse = generate_buckets(
                aargs.states_dir, key, lstm.part_width(key),
                aargs.buckets_dir, aargs.target)
            part_learned_mse[key] = learned_mse
            part_fixed_mse[key] = fixed_mse
    else:
        learned_mse, fixed_mse = generate_buckets_grouping(
            lstm, aargs.states_dir, aargs.grouping, aargs.buckets_dir,
            aargs.target)
        part_learned_mse = learned_mse
        part_fixed_mse = fixed_mse

    with open(os.path.join(aargs.buckets_dir, "analysis.csv"), "w") as fh:
        writer = csv_writer(fh)
        writer.writerow(["technique", "key", "mse"])
        total_learned = 0.0
        total_fixed = 0.0
        count_learned = 0
        count_fixed = 0

        for key, error in sorted(part_learned_mse.items()):
            total_learned += error
            count_learned += 1
            writer.writerow(["learned", key, "%f" % error])

        for key, error in sorted(part_fixed_mse.items()):
            total_fixed += error
            count_fixed += 1
            writer.writerow(["fixed", key, "%f" % error])

        user_log.info("Total scores (learned, fixed): %s, %s" %
                      (total_learned / count_learned,
                       total_fixed / count_fixed))

    return 0
Code example #5
def main(argv):
    ap = ArgumentParser(prog="analyze-hidden-states")
    ap.add_argument("-v",
                    "--verbose",
                    default=False,
                    action="store_true",
                    help="Turn on verbose logging.")
    #ap.add_argument("-d", "--dry-run", default=False, action="store_true")
    ap.add_argument("--train-data", default=False, action="store_true")
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    ap.add_argument("states_dir")
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0],
                  aargs.verbose, False, True, True)
    logging.debug(aargs)

    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir, True)
    stats = {}

    for key in lstm.keys():
        train_points, test_points = states.get_hidden_states(
            aargs.states_dir, key)

        if aargs.train_data:
            stats[key] = calculate_stats(train_points)
        else:
            stats[key] = calculate_stats(test_points)

    writer = csv_writer(sys.stdout)
    writer.writerow(["key"] + sorted(stats[next(iter(lstm.keys()))].keys()))
    averages = {}
    count = 0

    for key, key_stats in sorted(stats.items()):
        count += 1
        writer.writerow([key] +
                        [item[1] for item in sorted(key_stats.items())])

        for name, value in key_stats.items():
            if name not in averages:
                averages[name] = 0

            averages[name] += value

    writer.writerow(["global"] +
                    [item[1] / count for item in sorted(averages.items())])
    return 0
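
The loops above treat calculate_stats as returning a flat mapping from statistic name to value, so stats ends up nested as key -> statistic -> value. A tiny sketch with invented keys and statistic names shows how the header, per-key, and global rows derive from that shape:

# Invented keys and statistics, purely to illustrate the assumed shape.
stats = {
    "cell-0": {"mean": 0.25, "stddev": 0.5},
    "cell-1": {"mean": 0.75, "stddev": 1.5},
}

# Header row:   ["key", "mean", "stddev"]
# Per-key rows: ["cell-0", 0.25, 0.5] and ["cell-1", 0.75, 1.5]
# Global row:   ["global", 0.5, 1.0]   (each statistic averaged over the two keys)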
Code example #6
def main(argv):
    ap = ArgumentParser(prog="generate-query-database")
    ap.add_argument("-v",
                    "--verbose",
                    default=False,
                    action="store_true",
                    help="Turn on verbose logging.")
    ap.add_argument("--key-offsets", nargs="*", default=None)
    ap.add_argument("--db-kind", choices=["sqlite", "postgres"])
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    ap.add_argument("activation_dir")
    ap.add_argument("query_dir")
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0],
                  aargs.verbose, False, True, True)
    logging.debug(aargs)

    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir, True)
    threads = []

    for parameter in (lstm.keys()
                      if aargs.key_offsets is None else aargs.key_offsets):
        key = parameter if aargs.key_offsets is None else parameter.split(
            "#")[0]
        offset = 0 if aargs.key_offsets is None else int(
            parameter.split("#")[1])
        thread = threading.Thread(target=generate_db,
                                  args=[
                                      lstm, aargs.activation_dir, key,
                                      aargs.query_dir, aargs.db_kind, offset
                                  ])
        # Non-daemon threads will keep the program running until they finish (as per documentation).
        thread.daemon = False
        thread.start()
        threads += [thread]

    for thread in threads:
        thread.join()

    return 0
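
Each --key-offsets value is expected in a KEY#OFFSET form, parsed inline in the loop above. As a sketch only (the helper name and example key are invented), the same parsing could be factored into a small function:

def parse_key_offset(parameter):
    # Split a "KEY#OFFSET" argument, e.g. "outputs#2" -> ("outputs", 2).
    key, offset = parameter.split("#", 1)
    return key, int(offset)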
Code example #7
def main(argv):
    ap = ArgumentParser(prog="generate-hidden-states")
    ap.add_argument("-v",
                    "--verbose",
                    default=False,
                    action="store_true",
                    help="Turn on verbose logging.")
    ap.add_argument("-s",
                    "--sample-rate",
                    type=float,
                    default=0.1,
                    help="train then test sampling rates.")
    ap.add_argument("-d", "--dry-run", default=False, action="store_true")
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    ap.add_argument("states_dir")
    ap.add_argument("kind", choices=["train", "validation", "test"])
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0],
                  aargs.verbose, False, True, True)
    logging.debug(aargs)

    if aargs.dry_run:
        dry_run(data.stream_data(aargs.data_dir, aargs.kind),
                aargs.sample_rate, aargs.kind)
        return 0

    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir)
    description = data.get_description(aargs.data_dir)

    if description.task == data.LM:
        annotation_fn = lambda y, i: y[i][0]
    else:
        annotation_fn = lambda y, i: y

    elicit_hidden_states(lstm, data.stream_data(aargs.data_dir,
                                                aargs.kind), annotation_fn,
                         aargs.sample_rate, aargs.states_dir, aargs.kind)
    return 0
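
The two annotation_fn branches reflect different label layouts: for the data.LM task each position of the target sequence is assumed to carry a tuple whose first element is the annotation (hence y[i][0]), while for the other task the single label y applies to every position. A toy illustration with invented values:

# Invented target values, purely to illustrate the two layouts.
lm_annotation_fn = lambda y, i: y[i][0]
other_annotation_fn = lambda y, i: y

lm_y = [("the", 0.3), ("cat", 0.7)]         # assumed per-position (annotation, ...) tuples
print(lm_annotation_fn(lm_y, 1))            # -> cat
print(other_annotation_fn("positive", 1))   # -> positive, regardless of position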
Code example #8
def main(argv):
    ap = ArgumentParser(prog="generate-activation-states")
    ap.add_argument("-v",
                    "--verbose",
                    default=False,
                    action="store_true",
                    help="Turn on verbose logging.")
    #ap.add_argument("-d", "--dry-run", default=False, action="store_true")
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    ap.add_argument("activations_dir")
    ap.add_argument("kind", choices=["train", "validation", "test"])
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0],
                  aargs.verbose, False, True, True)
    logging.debug(aargs)

    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir)
    description = data.get_description(aargs.data_dir)
    elicit_activation_states(lstm, data.stream_data(aargs.data_dir,
                                                    aargs.kind),
                             aargs.activations_dir)

    return 0
Code example #9
    def __init__(self, data_dir, sequential_dir, buckets_dir, encoding_dir,
                 use_fixed_buckets):
        self.data_dir = data_dir
        self.sequential_dir = sequential_dir
        self.buckets_dir = buckets_dir
        self.encoding_dir = encoding_dir
        self.words = data.get_words(self.data_dir)
        description = data.get_description(self.data_dir)

        if description.task == data.LM:
            self.outputs = self.words
            # Map output words to their POS tags.
            pos_mapping = data.get_pos_mapping(self.data_dir)
            #self.output_mapping = lambda output: pos_mapping[output] if output in pos_mapping else "NN"
            #self.colour_mapping = pos_colour_mapping()
            self.output_mapping = lambda output: lm.COARSE_MAP[
                pos_mapping[output] if output in pos_mapping else "NN"]
            self.colour_mapping = coarse_colour_mapping()
            self.sort_key = lambda key_value: -key_value[1]
        else:
            self.outputs = data.get_outputs(self.data_dir)
            self.output_mapping = lambda output: output
            self.colour_mapping = sa_colour_mapping()
            self.sort_key = lambda key_value: sa.sentiment_sort_key(
                key_value[1])

        self.top_k = max(1,
                         int(len(self.outputs) * parameters.SEM_TOP_K_PERCENT))

        if use_fixed_buckets:
            self.bucket_mappings = reduction.get_fixed_buckets(
                self.buckets_dir)
        else:
            self.bucket_mappings = reduction.get_learned_buckets(
                self.buckets_dir)

        self.lstm = sequential.load_model(self.data_dir, self.sequential_dir)

        def _ffnn_constructor(scope, hyper_parameters, extra, case_field,
                              hidden_vector, word_labels, output_labels):
            if extra["word_input"]:
                input_field = mlbase.ConcatField(
                    [case_field, hidden_vector, word_labels])
            else:
                input_field = mlbase.ConcatField([case_field, hidden_vector])

            if extra["monolith"]:
                return model.Ffnn(scope, hyper_parameters, extra, input_field,
                                  output_labels)
            else:
                return model.SeparateFfnn(scope, hyper_parameters, extra,
                                          input_field, output_labels,
                                          case_field)

        self.sem = semantic.load_model(self.lstm,
                                       self.encoding_dir,
                                       model_fn=_ffnn_constructor)

        # TODO
        embedding_padding = tuple([0] * max(
            0, self.lstm.hyper_parameters.width -
            self.lstm.hyper_parameters.embedding_width))
        hidden_padding = tuple([0] * max(
            0, self.lstm.hyper_parameters.embedding_width -
            self.lstm.hyper_parameters.width))

        #if hasattr(model, "extra") and model.extra["word_input"]:
        #    def converter(key, hidden_state):
        #        return (key, tuple(hidden_state.point) + (embedding_padding if self.lstm.is_embedding(key) else hidden_padding), hidden_state.word)
        #else:
        def _as_input(key, point):
            return (key, tuple(point) +
                    (embedding_padding
                     if self.lstm.is_embedding(key) else hidden_padding))

        self.as_input = _as_input
        self.details_mins = {}
        self.details_maxs = {}
        self.weights_mins = {}
        self.weights_maxs = {}
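
The embedding_padding and hidden_padding tuples above pad whichever kind of point is narrower with zeros, so embedding-layer and hidden-layer states can be fed through the same input field. A self-contained sketch of that behaviour, using invented widths (5 hidden, 3 embedding) and hypothetical key names:

# Invented widths, purely for illustration.
width = 5            # stands in for self.lstm.hyper_parameters.width
embedding_width = 3  # stands in for self.lstm.hyper_parameters.embedding_width

embedding_padding = tuple([0] * max(0, width - embedding_width))  # (0, 0)
hidden_padding = tuple([0] * max(0, embedding_width - width))     # ()

def as_input(is_embedding, key, point):
    # Pad the point so every key yields max(width, embedding_width) values.
    return (key, tuple(point) +
            (embedding_padding if is_embedding else hidden_padding))

print(as_input(True, "embedding", (0.1, 0.2, 0.3)))
# -> ('embedding', (0.1, 0.2, 0.3, 0, 0))
print(as_input(False, "hidden-0", (0.1, 0.2, 0.3, 0.4, 0.5)))
# -> ('hidden-0', (0.1, 0.2, 0.3, 0.4, 0.5))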
Code example #10
def main(argv):
    ap = ArgumentParser(prog="generate-semantic-model")
    ap.add_argument("-v",
                    "--verbose",
                    default=False,
                    action="store_true",
                    help="Turn on verbose logging.")
    ap.add_argument("-i", "--initial-decays", default=5, type=int)
    ap.add_argument("-c", "--convergence-decays", default=2, type=int)
    ap.add_argument("-a", "--arc-epochs", default=3, type=int)
    ap.add_argument("-l", "--layers", default=2, type=int)
    ap.add_argument("-w", "--width", default=100, type=int)
    ap.add_argument("--word-input", default=False, action="store_true")
    ap.add_argument("-p", "--pre-existing", default=False, action="store_true")
    ap.add_argument("-m", "--monolith", default=False, action="store_true")
    ap.add_argument("--key-set", nargs="*", default=None)
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    ap.add_argument("states_dir")
    ap.add_argument("encoding_dir")
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0],
                  aargs.verbose, False, True, True)
    logging.debug(aargs)

    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir, True)
    user_log.info("Sem")
    hyper_parameters = model.HyperParameters(aargs.layers, aargs.width)
    extra = {
        "word_input": aargs.word_input,
        "monolith": aargs.monolith,
    }

    if aargs.pre_existing:
        sem = load_sem(lstm, aargs.encoding_dir)
    else:
        sem = generate_sem(lstm, hyper_parameters, extra, aargs.states_dir,
                           aargs.arc_epochs, aargs.encoding_dir, aargs.key_set,
                           aargs.initial_decays, aargs.convergence_decays)

    keys_sem, total_sem = test_model(lstm, sem, aargs.states_dir, False,
                                     aargs.key_set)
    # TODO
    #user_log.info("Baseline")
    #baseline = generate_baseline(aargs.data_dir, lstm, hyper_parameters, extra)
    #scores_baseline, totals_baseline = test_model(lstm, baseline, aargs.states_dir, True, aargs.key_set)

    with open(os.path.join(aargs.encoding_dir, "analysis-breakdown.csv"),
              "w") as fh:
        writer = csv_writer(fh)
        writer.writerow(["technique", "key", "perplexity"])

        for key, perplexity in sorted(keys_sem.items()):
            writer.writerow(["sem", key, "%f" % perplexity])

        #for key, scores in sorted(scores_baseline.items()):
        #    for name, score in sorted(scores.items()):
        #        writer.writerow(["baseline", key, name, "%f" % score])

    with open(os.path.join(aargs.encoding_dir, "analysis-totals.csv"),
              "w") as fh:
        writer = csv_writer(fh)
        writer.writerow(["technique", "perplexity"])
        writer.writerow(["sem", "%f" % total_sem])

        #for name, score in sorted(totals_baseline.items()):
        #    writer.writerow(["baseline", name, "%f" % score])

    return 0