def main(argv): ap = ArgumentParser(prog="generate-hidden-states") ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.") ap.add_argument("--report", default=False, action="store_true") ap.add_argument("data_dir") ap.add_argument("sequential_dir") ap.add_argument("kind", choices=["train", "validation", "test"]) ap.add_argument("dimensions", nargs="+", type=int) aargs = ap.parse_args(argv) setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], aargs.verbose, False, True, True) logging.debug(aargs) lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir) averages = categorize_rates(lstm, data.stream_data(aargs.data_dir, aargs.kind), aargs.dimensions, aargs.report) rows = [("", "0", "1")] for stat, dimension_points in averages.items(): for dimension, points in dimension_points.items(): rows += [("%s-%s" % (stat, dimension), *points)] with open("counter-statistics.csv", "w") as fh: writer = csv_writer(fh) for row in rows: writer.writerow(row) return 0
def main(): ap = ArgumentParser(prog="pattern-query") ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.") ap.add_argument("--query-dir", default=None) ap.add_argument("--db-kind", choices=["postgres", "sqlite"]) ap.add_argument("data_dir") ap.add_argument("sequential_dir") ap.add_argument("predicate", nargs="+") aargs = ap.parse_args(sys.argv[1:]) setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], aargs.verbose, False, True, True) logging.debug(aargs) lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir) query_engine = domain.QueryEngine(lstm, "moot", "postgres") predicates = Predicates(predicate_strs=aargs.predicate) logging.debug("invoking query: %s" % (predicates.as_strs())) result = query_engine.find(0.1, predicates) dump = json.dumps(result.as_json()) logging.debug("result: %s" % (dump)) print(dump) return 0
def main(argv): ap = ArgumentParser(prog="measure-sequence-changes") ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.") ap.add_argument("data_dir") ap.add_argument("kind", choices=["train", "validation", "test"]) ap.add_argument("sequential_dir") ap.add_argument("keys", nargs="+") aargs = ap.parse_args(argv) setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], aargs.verbose, False, True, True) logging.debug(aargs) lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir) minimum, maximum, sequence_changes = measure(lstm, aargs.data_dir, aargs.kind, aargs.keys) for key in aargs.keys: distance, index, sequence = minimum[key] sequence_str, changes_str = stringify(sequence, sequence_changes[sequence][key]) user_log.info("Global minimum for %s of %.4f @%d:\n %s\n %s" % (key, distance, index, sequence_str, changes_str)) distance, index, sequence = maximum[key] sequence_str, changes_str = stringify(sequence, sequence_changes[sequence][key]) user_log.info("Global maximum for %s of %.4f @%d:\n %s\n %s" % (key, distance, index, sequence_str, changes_str)) return 0
def main(argv): ap = ArgumentParser(prog="generate-reduction-buckets") ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.") ap.add_argument("--grouping", nargs="*", default=None) ap.add_argument("data_dir") ap.add_argument("sequential_dir") ap.add_argument("states_dir") ap.add_argument("buckets_dir") ap.add_argument("target", type=int) aargs = ap.parse_args(argv) setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], aargs.verbose, False, True, True) logging.debug(aargs) lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir, True) part_learned_mse = {} part_fixed_mse = {} if aargs.grouping is None: for key in lstm.keys(): learned_mse, fixed_mse = generate_buckets(aargs.states_dir, key, lstm.part_width(key), aargs.buckets_dir, aargs.target) part_learned_mse[key] = learned_mse part_fixed_mse[key] = fixed_mse else: learned_mse, fixed_mse = generate_buckets_grouping(lstm, aargs.states_dir, aargs.grouping, aargs.buckets_dir, aargs.target) part_learned_mse = learned_mse part_fixed_mse = fixed_mse with open(os.path.join(aargs.buckets_dir, "analysis.csv"), "w") as fh: writer = csv_writer(fh) writer.writerow(["technique", "key", "mse"]) total_learned = 0.0 total_fixed = 0.0 count_learned = 0 count_fixed = 0 for key, error in sorted(part_learned_mse.items()): total_learned += error count_learned += 1 writer.writerow(["learned", key, "%f" % error]) for key, error in sorted(part_fixed_mse.items()): total_fixed += error count_fixed += 1 writer.writerow(["fixed", key, "%f" % error]) user_log.info("Total scores (learned, fixed): %s, %s" % (total_learned / count_learned, total_fixed / count_fixed)) return 0
def main(argv): ap = ArgumentParser(prog="analyze-hidden-states") ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.") #ap.add_argument("-d", "--dry-run", default=False, action="store_true") ap.add_argument("--train-data", default=False, action="store_true") ap.add_argument("data_dir") ap.add_argument("sequential_dir") ap.add_argument("states_dir") aargs = ap.parse_args(argv) setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], aargs.verbose, False, True, True) logging.debug(aargs) lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir, True) stats = {} for key in lstm.keys(): train_points, test_points = states.get_hidden_states( aargs.states_dir, key) if aargs.train_data: stats[key] = calculate_stats(train_points) else: stats[key] = calculate_stats(test_points) writer = csv_writer(sys.stdout) writer.writerow(["key"] + sorted(stats[next(iter(lstm.keys()))].keys())) averages = {} count = 0 for key, stats in sorted(stats.items()): count += 1 writer.writerow([key] + [item[1] for item in sorted(stats.items())]) for key, value in stats.items(): if key not in averages: averages[key] = 0 averages[key] += value writer.writerow(["global"] + [item[1] / count for item in sorted(averages.items())]) return 0
def main(argv): ap = ArgumentParser(prog="generate-query-database") ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.") ap.add_argument("--key-offsets", nargs="*", default=None) ap.add_argument("--db-kind", choices=["sqlite", "postgres"]) ap.add_argument("data_dir") ap.add_argument("sequential_dir") ap.add_argument("activation_dir") ap.add_argument("query_dir") aargs = ap.parse_args(argv) setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], aargs.verbose, False, True, True) logging.debug(aargs) lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir, True) threads = [] for parameter in (lstm.keys() if aargs.key_offsets is None else aargs.key_offsets): key = parameter if aargs.key_offsets is None else parameter.split( "#")[0] offset = 0 if aargs.key_offsets is None else int( parameter.split("#")[1]) thread = threading.Thread(target=generate_db, args=[ lstm, aargs.activation_dir, key, aargs.query_dir, aargs.db_kind, offset ]) # Non-daemon threads will keep the program running until they finish (as per documentation). thread.daemon = False thread.start() threads += [thread] for thread in threads: thread.join() return 0
def main(argv): ap = ArgumentParser(prog="generate-hidden-states") ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.") ap.add_argument("-s", "--sample-rate", type=float, default=0.1, help="train then test sampling rates.") ap.add_argument("-d", "--dry-run", default=False, action="store_true") ap.add_argument("data_dir") ap.add_argument("sequential_dir") ap.add_argument("states_dir") ap.add_argument("kind", choices=["train", "validation", "test"]) aargs = ap.parse_args(argv) setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], aargs.verbose, False, True, True) logging.debug(aargs) if aargs.dry_run: dry_run(data.stream_data(aargs.data_dir, aargs.kind), aargs.sample_rate, aargs.kind) return 0 lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir) description = data.get_description(aargs.data_dir) if description.task == data.LM: annotation_fn = lambda y, i: y[i][0] else: annotation_fn = lambda y, i: y elicit_hidden_states(lstm, data.stream_data(aargs.data_dir, aargs.kind), annotation_fn, aargs.sample_rate, aargs.states_dir, aargs.kind) return 0
def main(argv): ap = ArgumentParser(prog="generate-activation-states") ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.") #ap.add_argument("-d", "--dry-run", default=False, action="store_true") ap.add_argument("data_dir") ap.add_argument("sequential_dir") ap.add_argument("activations_dir") ap.add_argument("kind", choices=["train", "validation", "test"]) aargs = ap.parse_args(argv) setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], aargs.verbose, False, True, True) logging.debug(aargs) lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir) description = data.get_description(aargs.data_dir) elicit_activation_states(lstm, data.stream_data(aargs.data_dir, aargs.kind), aargs.activations_dir) return 0
def __init__(self, data_dir, sequential_dir, buckets_dir, encoding_dir, use_fixed_buckets):
    self.data_dir = data_dir
    self.sequential_dir = sequential_dir
    self.buckets_dir = buckets_dir
    self.encoding_dir = encoding_dir
    self.words = data.get_words(self.data_dir)
    description = data.get_description(self.data_dir)

    if description.task == data.LM:
        self.outputs = self.words
        # Map output words to their POS tags.
        pos_mapping = data.get_pos_mapping(self.data_dir)
        #self.output_mapping = lambda output: pos_mapping[output] if output in pos_mapping else "NN"
        #self.colour_mapping = pos_colour_mapping()
        self.output_mapping = lambda output: lm.COARSE_MAP[pos_mapping[output] if output in pos_mapping else "NN"]
        self.colour_mapping = coarse_colour_mapping()
        self.sort_key = lambda key_value: -key_value[1]
    else:
        self.outputs = data.get_outputs(self.data_dir)
        self.output_mapping = lambda output: output
        self.colour_mapping = sa_colour_mapping()
        self.sort_key = lambda key_value: sa.sentiment_sort_key(key_value[1])

    self.top_k = max(1, int(len(self.outputs) * parameters.SEM_TOP_K_PERCENT))

    if use_fixed_buckets:
        self.bucket_mappings = reduction.get_fixed_buckets(self.buckets_dir)
    else:
        self.bucket_mappings = reduction.get_learned_buckets(self.buckets_dir)

    self.lstm = sequential.load_model(self.data_dir, self.sequential_dir)

    def _ffnn_constructor(scope, hyper_parameters, extra, case_field, hidden_vector, word_labels, output_labels):
        if extra["word_input"]:
            input_field = mlbase.ConcatField([case_field, hidden_vector, word_labels])
        else:
            input_field = mlbase.ConcatField([case_field, hidden_vector])

        if extra["monolith"]:
            return model.Ffnn(scope, hyper_parameters, extra, input_field, output_labels)
        else:
            return model.SeparateFfnn(scope, hyper_parameters, extra, input_field, output_labels, case_field)

    self.sem = semantic.load_model(self.lstm, self.encoding_dir, model_fn=_ffnn_constructor)
    # TODO
    embedding_padding = tuple([0] * max(0, self.lstm.hyper_parameters.width - self.lstm.hyper_parameters.embedding_width))
    hidden_padding = tuple([0] * max(0, self.lstm.hyper_parameters.embedding_width - self.lstm.hyper_parameters.width))
    #if hasattr(model, "extra") and model.extra["word_input"]:
    #    def converter(key, hidden_state):
    #        return (key, tuple(hidden_state.point) + (embedding_padding if self.lstm.is_embedding(key) else hidden_padding), hidden_state.word)
    #else:

    def _as_input(key, point):
        return (key, tuple(point) + (embedding_padding if self.lstm.is_embedding(key) else hidden_padding))

    self.as_input = _as_input
    self.details_mins = {}
    self.details_maxs = {}
    self.weights_mins = {}
    self.weights_maxs = {}
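# Toy illustration of the zero-padding performed by _as_input above (the numbers
# are assumptions, not the real hyper-parameters): points are padded so that
# embedding and hidden states share a common width of max(width, embedding_width).
#
#   width = 5, embedding_width = 3
#   embedding point (0.1, 0.2, 0.3)            -> (0.1, 0.2, 0.3, 0, 0)
#   hidden point    (0.1, 0.2, 0.3, 0.4, 0.5)  -> unchanged (hidden_padding is empty)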
def main(argv): ap = ArgumentParser(prog="generate-semantic-model") ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.") ap.add_argument("-i", "--initial-decays", default=5, type=int) ap.add_argument("-c", "--convergence-decays", default=2, type=int) ap.add_argument("-a", "--arc-epochs", default=3, type=int) ap.add_argument("-l", "--layers", default=2, type=int) ap.add_argument("-w", "--width", default=100, type=int) ap.add_argument("--word-input", default=False, action="store_true") ap.add_argument("-p", "--pre-existing", default=False, action="store_true") ap.add_argument("-m", "--monolith", default=False, action="store_true") ap.add_argument("--key-set", nargs="*", default=None) ap.add_argument("data_dir") ap.add_argument("sequential_dir") ap.add_argument("states_dir") ap.add_argument("encoding_dir") aargs = ap.parse_args(argv) setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], aargs.verbose, False, True, True) logging.debug(aargs) lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir, True) user_log.info("Sem") hyper_parameters = model.HyperParameters(aargs.layers, aargs.width) extra = { "word_input": aargs.word_input, "monolith": aargs.monolith, } if aargs.pre_existing: sem = load_sem(lstm, aargs.encoding_dir) else: sem = generate_sem(lstm, hyper_parameters, extra, aargs.states_dir, aargs.arc_epochs, aargs.encoding_dir, aargs.key_set, aargs.initial_decays, aargs.convergence_decays) keys_sem, total_sem = test_model(lstm, sem, aargs.states_dir, False, aargs.key_set) # TODO #user_log.info("Baseline") #baseline = generate_baseline(aargs.data_dir, lstm, hyper_parameters, extra) #scores_baseline, totals_baseline = test_model(lstm, baseline, aargs.states_dir, True, aargs.key_set) with open(os.path.join(aargs.encoding_dir, "analysis-breakdown.csv"), "w") as fh: writer = csv_writer(fh) writer.writerow(["technique", "key", "perplexity"]) for key, perplexity in sorted(keys_sem.items()): writer.writerow(["sem", key, "%f" % perplexity]) #for key, scores in sorted(scores_baseline.items()): # for name, score in sorted(scores.items()): # writer.writerow(["baseline", key, name, "%f" % score]) with open(os.path.join(aargs.encoding_dir, "analysis-totals.csv"), "w") as fh: writer = csv_writer(fh) writer.writerow(["technique", "perplexity"]) writer.writerow(["sem", "%f" % total_sem]) #for name, score in sorted(totals_baseline.items()): # writer.writerow(["baseline", name, "%f" % score]) return 0