def main(argv):
    """Report the global minimum and maximum sequence change for each requested key."""
    parser = ArgumentParser(prog="measure-sequence-changes")
    parser.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.")
    parser.add_argument("data_dir")
    parser.add_argument("kind", choices=["train", "validation", "test"])
    parser.add_argument("sequential_dir")
    parser.add_argument("keys", nargs="+")
    options = parser.parse_args(argv)
    log_name = ".%s.log" % os.path.splitext(os.path.basename(__file__))[0]
    setup_logging(log_name, options.verbose, False, True, True)
    logging.debug(options)
    lstm = sequential.load_model(options.data_dir, options.sequential_dir)
    minimum, maximum, sequence_changes = measure(lstm, options.data_dir, options.kind, options.keys)
    for key in options.keys:
        # Report the minimum extreme for this key.
        low_distance, low_index, low_sequence = minimum[key]
        low_str, low_changes = stringify(low_sequence, sequence_changes[low_sequence][key])
        user_log.info("Global minimum for %s of %.4f @%d:\n %s\n %s" % (key, low_distance, low_index, low_str, low_changes))
        # Report the maximum extreme for this key.
        high_distance, high_index, high_sequence = maximum[key]
        high_str, high_changes = stringify(high_sequence, sequence_changes[high_sequence][key])
        user_log.info("Global maximum for %s of %.4f @%d:\n %s\n %s" % (key, high_distance, high_index, high_str, high_changes))
    return 0
def main(argv):
    """Compute counter activation-rate statistics and dump them to a CSV file."""
    parser = ArgumentParser(prog="generate-hidden-states")
    parser.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.")
    parser.add_argument("--report", default=False, action="store_true")
    parser.add_argument("data_dir")
    parser.add_argument("sequential_dir")
    parser.add_argument("kind", choices=["train", "validation", "test"])
    parser.add_argument("dimensions", nargs="+", type=int)
    options = parser.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], options.verbose, False, True, True)
    logging.debug(options)
    lstm = sequential.load_model(options.data_dir, options.sequential_dir)
    averages = categorize_rates(lstm, data.stream_data(options.data_dir, options.kind), options.dimensions, options.report)
    # Write the header plus one row per (statistic, dimension) pair directly,
    # rather than accumulating the rows first.
    with open("counter-statistics.csv", "w") as fh:
        writer = csv_writer(fh)
        writer.writerow(("", "0", "1"))
        for stat, dimension_points in averages.items():
            for dimension, points in dimension_points.items():
                writer.writerow(("%s-%s" % (stat, dimension), *points))
    return 0
def main():
    """Run a pattern query against the sequential model and print the JSON result.

    Fix: the --query-dir and --db-kind flags were parsed but then ignored
    (QueryEngine was always constructed with "moot"/"postgres"). They are now
    honored, falling back to the original hard-coded values when omitted, so
    existing invocations behave identically.
    """
    ap = ArgumentParser(prog="pattern-query")
    ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.")
    ap.add_argument("--query-dir", default=None)
    ap.add_argument("--db-kind", choices=["postgres", "sqlite"])
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    ap.add_argument("predicate", nargs="+")
    aargs = ap.parse_args(sys.argv[1:])
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], aargs.verbose, False, True, True)
    logging.debug(aargs)
    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir)
    # Preserve the historical defaults when the flags are not supplied.
    query_dir = aargs.query_dir if aargs.query_dir is not None else "moot"
    db_kind = aargs.db_kind if aargs.db_kind is not None else "postgres"
    query_engine = domain.QueryEngine(lstm, query_dir, db_kind)
    predicates = Predicates(predicate_strs=aargs.predicate)
    logging.debug("invoking query: %s" % (predicates.as_strs()))
    result = query_engine.find(0.1, predicates)
    dump = json.dumps(result.as_json())
    logging.debug("result: %s" % (dump))
    print(dump)
    return 0
def main(argv):
    """Generate the train/validation/test data sets for a task ('sa' or 'lm').

    Fixes: input validation now raises ValueError instead of using `assert`
    (asserts are stripped under `python -O`), and the `form` help text gains
    the missing newlines so the three options no longer run together.
    """
    ap = ArgumentParser(prog="generate-data")
    ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.")
    ap.add_argument("task", help="Either 'sa' or 'lm'.")
    ap.add_argument("form", help="How the language data should be interpreted:\n"
                    "raw: the text is raw (must still be run through a tokenizer).\n"
                    "tokenized: the text has been tokenized (space separate tokens, new lines separate sentences).\n"
                    "ptb: the text is tokenized and pos tagged in Penn Treebank form.")
    ap.add_argument("corpus_paths", nargs="+")
    ap.add_argument("data_dir")
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], aargs.verbose, False, True, True)
    logging.debug(aargs)
    if aargs.task == "sa":
        # Sentiment analysis only supports pre-tokenized input.
        if aargs.form != "tokenized":
            raise ValueError("Task 'sa' requires form 'tokenized', not '%s'." % aargs.form)
        train_xys, validation_xys, test_xys = sa.create(aargs.data_dir, lambda: stream_input_stanford(aargs.corpus_paths[0]))
    elif aargs.task == "lm":
        train_xys, validation_xys, test_xys = lm.create(aargs.data_dir, lambda: stream_input_text(aargs.corpus_paths, aargs.form))
    else:
        raise ValueError("Unknown task: %s" % aargs.task)
    logging.debug("data sets (train, validation, test): %d, %d, %d" % (len(train_xys), len(validation_xys), len(test_xys)))
    return 0
def main(argv):
    """Render a markdown file into a standalone HTML document."""
    parser = ArgumentParser(prog="markdowner")
    parser.add_argument("--verbose", "-v", default=False, action="store_true",
                        help="Turn on verbose logging. **This will SIGNIFICANTLY slow down the program.**")
    parser.add_argument("markdown_file")
    parser.add_argument("output_html")
    options = parser.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], options.verbose, True)
    logging.debug(options)
    with open(options.markdown_file, "r") as source:
        rendered = markdown.markdown(source.read(), extensions=EXTENSIONS, output_format="html5")
    # Wrap the rendered body in the fixed document prefix/suffix.
    with open(options.output_html, "w") as sink:
        sink.write(html_prefix + rendered + html_suffix)
    return 0
def main(argv):
    """Start the exploration server with its neural network and optional query engines.

    Fix: on KeyboardInterrupt the old code evaluated `if patched:` — but
    `patched` is only defined by patch_thread_for_profiling(), which is
    commented out above, so Ctrl-C died with a NameError instead of the
    intended KeyboardInterrupt. The lookup is now guarded. Dead benchmark
    code was also removed.
    """
    ap = ArgumentParser(prog="server")
    ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.")
    ap.add_argument("-p", "--port", default=8888, type=int)
    ap.add_argument("--query-dir", default=None)
    ap.add_argument("--db-kind", choices=["postgres", "sqlite"])
    ap.add_argument("--use-fixed-buckets", default=False, action="store_true")
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    ap.add_argument("buckets_dir")
    ap.add_argument("encoding_dir")
    aargs = ap.parse_args(argv)
    #patch_thread_for_profiling()
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], aargs.verbose, False, True, True)
    logging.debug(aargs)
    words = data.get_words(aargs.data_dir)
    neural_network = domain.NeuralNetwork(aargs.data_dir, aargs.sequential_dir, aargs.buckets_dir, aargs.encoding_dir, aargs.use_fixed_buckets)
    # Query/pattern engines are only available when a query directory is supplied.
    query_engine = None
    pattern_engine = None
    if aargs.query_dir is not None:
        query_engine = domain.QueryEngine(neural_network.lstm, aargs.query_dir, aargs.db_kind)
        pattern_engine = domain.PatternEngine(neural_network.lstm)
    run_server(aargs.port, words, neural_network, query_engine, pattern_engine)
    try:
        neural_network._background_setup.join()
    except KeyboardInterrupt:
        # `patched` only exists when profiling was enabled; guard the lookup.
        if globals().get("patched"):
            neural_network._background_setup.complete_profile()
        raise
    return 0
def main(argv):
    """Analyze the train and test splits of a data directory."""
    parser = ArgumentParser(prog="analyze-data")
    parser.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.")
    parser.add_argument("data_dir")
    options = parser.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], options.verbose, False, True, True)
    logging.debug(options)
    # Analyze each split in turn, creating each stream only when needed.
    for split in ("train", "test"):
        stream = data.stream_train(options.data_dir) if split == "train" else data.stream_test(options.data_dir)
        analyze(stream, split)
    return 0
def main(argv):
    """Start the front-end conversion server.

    Fix: now returns 0 on the normal path, for consistency with every other
    entry point in this code base (callers that sys.exit(main(...)) get an
    explicit success code).
    """
    ap = ArgumentParser(prog="server")
    # Unfortunately, logging in python 2.7 doesn't have a built-in way to log
    # asynchronously — hence the warning in the help text.
    ap.add_argument("--verbose", "-v", default=False, action="store_true",
                    help="Turn on verbose logging. " + "**This will SIGNIFICANTLY slow down the program.**")
    ap.add_argument("-p", "--port", default=8888, type=int)
    args = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], args.verbose, True)
    logging.debug(args)
    fe_converter = FeConverter()
    run(args.port, fe_converter)
    return 0
def main(argv):
    """Search a data split for sequences matching the given words.

    Fix: the 'relative' match mode validated its input with `assert`, which is
    stripped under `python -O`; it now raises ValueError explicitly.
    """
    ap = ArgumentParser(prog="query-data")
    ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.")
    ap.add_argument("--limit", type=int, default=10,
                    help="Truncate the results at maximum LIMIT. Negative indicates to find all (unlimited).")
    ap.add_argument("--match", choices=["include", "sequence", "relative"], default="include")
    ap.add_argument("data_dir")
    ap.add_argument("kind", choices=["train", "test"])
    ap.add_argument("words", nargs="*", default=None)
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], aargs.verbose, False, True, True)
    logging.debug(aargs)
    if aargs.match == "relative" and len(aargs.words) != 2:
        # Quickest way to implement relative is just to make it correct for N = 2.
        raise ValueError("--match relative requires exactly 2 words, got %d." % len(aargs.words))
    truncated = False
    count = 0
    for xy in data.stream_data(aargs.data_dir, aargs.kind):
        # TODO: work for non-lm cases.
        sequence = [item[0] for item in xy.x] + [xy.y[-1][0]]
        if matches(sequence, aargs.words, aargs.match):
            count += 1
            logging.debug("Instance: %s" % " ".join(sequence))
            # A non-positive limit means "find all".
            if aargs.limit > 0 and count >= aargs.limit:
                logging.debug("Truncating..")
                truncated = True
                break
    user_log.info("Found %d%s instances." % (count, " (truncated)" if truncated else ""))
    return 0
def main(argv):
    """Generate reduction buckets and write out the learned/fixed MSE analysis."""
    parser = ArgumentParser(prog="generate-reduction-buckets")
    parser.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.")
    parser.add_argument("--grouping", nargs="*", default=None)
    parser.add_argument("data_dir")
    parser.add_argument("sequential_dir")
    parser.add_argument("states_dir")
    parser.add_argument("buckets_dir")
    parser.add_argument("target", type=int)
    options = parser.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], options.verbose, False, True, True)
    logging.debug(options)
    lstm = sequential.load_model(options.data_dir, options.sequential_dir, True)
    if options.grouping is None:
        # Per-key bucketing: one learned/fixed MSE pair for each lstm key.
        part_learned_mse = {}
        part_fixed_mse = {}
        for key in lstm.keys():
            learned, fixed = generate_buckets(options.states_dir, key, lstm.part_width(key), options.buckets_dir, options.target)
            part_learned_mse[key] = learned
            part_fixed_mse[key] = fixed
    else:
        part_learned_mse, part_fixed_mse = generate_buckets_grouping(lstm, options.states_dir, options.grouping, options.buckets_dir, options.target)
    with open(os.path.join(options.buckets_dir, "analysis.csv"), "w") as fh:
        writer = csv_writer(fh)
        writer.writerow(["technique", "key", "mse"])
        # Accumulate in sorted-key order while writing out the per-key rows.
        total_learned = 0.0
        count_learned = 0
        for key, error in sorted(part_learned_mse.items()):
            total_learned += error
            count_learned += 1
            writer.writerow(["learned", key, "%f" % error])
        total_fixed = 0.0
        count_fixed = 0
        for key, error in sorted(part_fixed_mse.items()):
            total_fixed += error
            count_fixed += 1
            writer.writerow(["fixed", key, "%f" % error])
    user_log.info("Total scores (learned, fixed): %s, %s" % (total_learned / count_learned, total_fixed / count_fixed))
    return 0
def main(argv):
    """Compute per-key hidden-state statistics and print them (plus averages) as CSV.

    Fix: the output loop used `for key, stats in sorted(stats.items())`,
    rebinding both the dict being iterated and the `key` name from the earlier
    loop. It only worked because sorted() materializes the items first; the
    loop variables are renamed so the code no longer shadows its own data.
    """
    ap = ArgumentParser(prog="analyze-hidden-states")
    ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.")
    ap.add_argument("--train-data", default=False, action="store_true")
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    ap.add_argument("states_dir")
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], aargs.verbose, False, True, True)
    logging.debug(aargs)
    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir, True)
    stats = {}
    for key in lstm.keys():
        train_points, test_points = states.get_hidden_states(aargs.states_dir, key)
        stats[key] = calculate_stats(train_points if aargs.train_data else test_points)
    writer = csv_writer(sys.stdout)
    # Header: statistic names, assumed identical across keys — taken from the first key.
    writer.writerow(["key"] + sorted(stats[next(iter(lstm.keys()))].keys()))
    averages = {}
    count = 0
    for key, key_stats in sorted(stats.items()):
        count += 1
        writer.writerow([key] + [item[1] for item in sorted(key_stats.items())])
        for stat_name, value in key_stats.items():
            if stat_name not in averages:
                averages[stat_name] = 0
            averages[stat_name] += value
    writer.writerow(["global"] + [item[1] / count for item in sorted(averages.items())])
    return 0
def main(argv):
    """Run the snow-patrol daemon against the supplied configuration."""
    parser = ArgumentParser(prog="snow-patrol daemon")
    parser.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.")
    parser.add_argument("config_path")
    parser.add_argument("--dry-run", action="store_true", default=False)
    options = parser.parse_args(argv)
    # The log file name embeds both the script name and the config name so
    # multiple daemons can run side by side.
    script_name = os.path.splitext(os.path.basename(__file__))[0]
    setup_logging(".%s.%s.log" % (script_name, os.path.basename(options.config_path)), options.verbose, False, True, True)
    config = model.load(options.config_path)
    logging.debug("Running under: %s" % config)
    run_continuously(config, options.dry_run)
    return 0
def main(argv):
    """Start the echo API server on the requested port."""
    parser = ArgumentParser(prog="server")
    parser.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.")
    parser.add_argument("-p", "--port", default=8888, type=int)
    options = parser.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], options.verbose, False, True, True)
    logging.debug(options)
    # Both routes map to fresh Echo handlers.
    handlers = {
        "echo": Echo(),
        "echo/echo": Echo(),
    }
    run_server(options.port, "api", "resources", handlers)
    return 0
def main(argv):
    """Generate the query database, one worker thread per key (or key#offset)."""
    parser = ArgumentParser(prog="generate-query-database")
    parser.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.")
    parser.add_argument("--key-offsets", nargs="*", default=None)
    parser.add_argument("--db-kind", choices=["sqlite", "postgres"])
    parser.add_argument("data_dir")
    parser.add_argument("sequential_dir")
    parser.add_argument("activation_dir")
    parser.add_argument("query_dir")
    aargs = parser.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], aargs.verbose, False, True, True)
    logging.debug(aargs)
    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir, True)
    threads = []
    for parameter in (lstm.keys() if aargs.key_offsets is None else aargs.key_offsets):
        # Without --key-offsets the parameter is a bare key at offset 0;
        # otherwise it is "key#offset", split once up front.
        if aargs.key_offsets is None:
            key = parameter
            offset = 0
        else:
            pieces = parameter.split("#")
            key = pieces[0]
            offset = int(pieces[1])
        worker = threading.Thread(target=generate_db, args=[lstm, aargs.activation_dir, key, aargs.query_dir, aargs.db_kind, offset])
        # Non-daemon threads will keep the program running until they finish (as per documentation).
        worker.daemon = False
        worker.start()
        threads.append(worker)
    for worker in threads:
        worker.join()
    return 0
def main():
    """Build a termnet from the input texts.

    Fixes: --window now declares type=int — previously a command-line value
    arrived as a str while the default was the int 0, so build() could receive
    inconsistent types. The unused `net` binding is also dropped.
    """
    ap = ArgumentParser(prog="termnet")
    # Unfortunately, logging in python 2.7 doesn't have a built-in way to log
    # asynchronously — hence the warning in the help text.
    ap.add_argument("--verbose", "-v", default=False, action="store_true",
                    help="Turn on verbose logging. " + "**This will SIGNIFICANTLY slow down the program.**")
    ap.add_argument("-f", "--input-format", default=workbench.parser.WIKIPEDIA,
                    help="One of %s" % workbench.parser.FORMATS)
    ap.add_argument("-w", "--window", default=0, type=int)
    ap.add_argument("input_texts", nargs="+")
    args = ap.parse_args()
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], args.verbose, True)
    logging.debug(args)
    build(args.input_texts, args.input_format, args.window)
    return 0
def main(argv):
    """Sample hidden states out of the sequential model for one data split."""
    parser = ArgumentParser(prog="generate-hidden-states")
    parser.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.")
    parser.add_argument("-s", "--sample-rate", type=float, default=0.1, help="train then test sampling rates.")
    parser.add_argument("-d", "--dry-run", default=False, action="store_true")
    parser.add_argument("data_dir")
    parser.add_argument("sequential_dir")
    parser.add_argument("states_dir")
    parser.add_argument("kind", choices=["train", "validation", "test"])
    options = parser.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], options.verbose, False, True, True)
    logging.debug(options)
    if options.dry_run:
        dry_run(data.stream_data(options.data_dir, options.kind), options.sample_rate, options.kind)
        return 0
    lstm = sequential.load_model(options.data_dir, options.sequential_dir)
    # Language-model data annotates with the i-th target token; other tasks
    # use the whole y value.
    if data.get_description(options.data_dir).task == data.LM:
        def annotation_fn(y, i):
            return y[i][0]
    else:
        def annotation_fn(y, i):
            return y
    elicit_hidden_states(lstm, data.stream_data(options.data_dir, options.kind), annotation_fn, options.sample_rate, options.states_dir, options.kind)
    return 0
def main(argv):
    """Train the language model on the corpus and report its test accuracy.

    Fixes: adds the `return 0` every other entry point has, and removes the
    dead commented-out busy-wait (the join below already blocks).
    """
    ap = ArgumentParser(prog="language-model")
    ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.")
    ap.add_argument("--corpus", default="corpus.txt")
    ap.add_argument("--epochs", default=100, type=int)
    args = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], args.verbose, False, True)
    words, xy_sequences, neural_network = domain.create(args.corpus, args.epochs, args.verbose)
    # Block until background training has finished before testing.
    neural_network._background_training.join()
    accuracy = neural_network.lstm.test([[rnn.Xy(t[0], t[1]) for t in sequence] for sequence in xy_sequences], True)
    user_log.info("accuracy: %s" % accuracy)
    return 0
def main(argv):
    """Train a sequential (RNN) model from the data directory.

    Fix: the result of generate_rnn() was bound to an unused local named
    `rnn`, shadowing the `rnn` module used elsewhere in this file; the
    binding is dropped.
    """
    ap = ArgumentParser(prog="generate-sequential-model")
    ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.")
    ap.add_argument("-l", "--layers", default=2, type=int)
    ap.add_argument("-w", "--width", default=100, type=int)
    ap.add_argument("-e", "--embedding-width", default=50, type=int)
    ap.add_argument("--srnn", default=False, action="store_true", help="use the 'srnn' ablation")
    ap.add_argument("--out", default=False, action="store_true", help="use the 'out' ablation")
    ap.add_argument("-b", "--batch", default=32, type=int)
    ap.add_argument("-a", "--arc-epochs", default=5, type=int)
    ap.add_argument("-i", "--initial-decays", default=5, type=int)
    ap.add_argument("-c", "--convergence-decays", default=2, type=int)
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], aargs.verbose, False, True, True)
    logging.debug(aargs)
    hyper_parameters = sequential.HyperParameters(aargs.layers, aargs.width, aargs.embedding_width)
    ablations = sequential.Ablations(aargs.srnn, aargs.out)
    generate_rnn(aargs.data_dir, hyper_parameters, ablations, aargs.batch, aargs.arc_epochs, aargs.initial_decays, aargs.convergence_decays, aargs.sequential_dir)
    return 0
def main(argv):
    """Elicit activation states from the sequential model for one data split.

    Fix: drops the `description` local, which was computed via
    data.get_description() and never used.
    """
    ap = ArgumentParser(prog="generate-activation-states")
    ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.")
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    ap.add_argument("activations_dir")
    ap.add_argument("kind", choices=["train", "validation", "test"])
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], aargs.verbose, False, True, True)
    logging.debug(aargs)
    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir)
    elicit_activation_states(lstm, data.stream_data(aargs.data_dir, aargs.kind), aargs.activations_dir)
    return 0
def main(argv):
    """Train (or load) a semantic encoding model on top of the sequential model,
    evaluate its perplexity, and write per-key and total analysis CSVs into the
    encoding directory.
    """
    ap = ArgumentParser(prog="generate-semantic-model")
    ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.")
    #ap.add_argument("-d", "--dry-run", default=False, action="store_true")
    ap.add_argument("-i", "--initial-decays", default=5, type=int)
    ap.add_argument("-c", "--convergence-decays", default=2, type=int)
    ap.add_argument("-a", "--arc-epochs", default=3, type=int)
    ap.add_argument("-l", "--layers", default=2, type=int)
    ap.add_argument("-w", "--width", default=100, type=int)
    ap.add_argument("--word-input", default=False, action="store_true")
    ap.add_argument("-p", "--pre-existing", default=False, action="store_true")
    ap.add_argument("-m", "--monolith", default=False, action="store_true")
    ap.add_argument("--key-set", nargs="*", default=None)
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    ap.add_argument("states_dir")
    ap.add_argument("encoding_dir")
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], aargs.verbose, False, True, True)
    logging.debug(aargs)
    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir, True)
    user_log.info("Sem")
    hyper_parameters = model.HyperParameters(aargs.layers, aargs.width)
    # Extra model options passed through to generate_sem.
    extra = {
        "word_input": aargs.word_input,
        "monolith": aargs.monolith,
    }
    # Either reuse a previously trained sem model or train a new one from the
    # elicited hidden states.
    if aargs.pre_existing:
        sem = load_sem(lstm, aargs.encoding_dir)
    else:
        sem = generate_sem(lstm, hyper_parameters, extra, aargs.states_dir, aargs.arc_epochs, aargs.encoding_dir, aargs.key_set, aargs.initial_decays, aargs.convergence_decays)
    # keys_sem: per-key perplexities; total_sem: overall perplexity.
    keys_sem, total_sem = test_model(lstm, sem, aargs.states_dir, False, aargs.key_set)
    # TODO
    #user_log.info("Baseline")
    #baseline = generate_baseline(aargs.data_dir, lstm, hyper_parameters, extra)
    #scores_baseline, totals_baseline = test_model(lstm, baseline, aargs.states_dir, True, aargs.key_set)
    with open(os.path.join(aargs.encoding_dir, "analysis-breakdown.csv"), "w") as fh:
        writer = csv_writer(fh)
        writer.writerow(["technique", "key", "perplexity"])
        for key, perplexity in sorted(keys_sem.items()):
            writer.writerow(["sem", key, "%f" % perplexity])
        #for key, scores in sorted(scores_baseline.items()):
        #    for name, score in sorted(scores.items()):
        #        writer.writerow(["baseline", key, name, "%f" % score])
    with open(os.path.join(aargs.encoding_dir, "analysis-totals.csv"), "w") as fh:
        writer = csv_writer(fh)
        writer.writerow(["technique", "perplexity"])
        writer.writerow(["sem", "%f" % total_sem])
        #for name, score in sorted(totals_baseline.items()):
        #    writer.writerow(["baseline", name, "%f" % score])
    return 0
import os from ml import nn as ffnn from ml import base as mlbase from pytils import adjutant from pytils.log import setup_logging, user_log setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], True, False, True) KINDS = ["outputs", "cells"] LAYERS = 2 WIDTH = 5 words = set(["abc", "def", "ghi"]) kind_labels = mlbase.Labels(set(KINDS)) layer_labels = mlbase.Labels(set(range(LAYERS))) activation_vector = mlbase.VectorField(WIDTH) predictor_input = mlbase.ConcatField( [kind_labels, layer_labels, activation_vector]) predictor_output = mlbase.Labels(words) predictor = ffnn.Model("predictor", ffnn.HyperParameters().width(10).layers(1), predictor_input, predictor_output, mlbase.SINGLE_LABEL) data = [ mlbase.Xy(("outputs", 0, [.1, .2, .3, .4, .5]), { "abc": .6, "def": .2, "ghi": .2 }), mlbase.Xy(("outputs", 1, [.1, .2, .3, .4, .5]), {