def get_doc(): """Gets the next document for this user""" uid = str(flask.request.headers.get('uuid')) doc_number = -1 document = '' predicted_label = BASE_LABEL uncertainty = BASE_UNCERTAINTY if uid not in MODELS: train_model(uid) with LOCK: if uid in USER_DICT: # do what we need to get the right document for this user labeled_doc_ids = USER_DICT[uid]['labeled_doc_ids'] unlabeled_doc_ids = USER_DICT[uid]['unlabeled_doc_ids'] candidates = select.reservoir(unlabeled_doc_ids, RNG, CAND_SIZE) doc_number = SELECT_METHOD(DATASET, labeled_doc_ids, candidates, MODELS[uid], RNG, LABEL_INCREMENT)[0] document = DATASET.doc_metadata(doc_number, 'text') USER_DICT[uid]['current_doc'] = doc_number if (len(labeled_doc_ids) >= START_TRAINING and USER_DICT[uid]['training_complete'] is True): doc = DATASET.doc_tokens(doc_number) predicted_label = MODELS[uid].predict(doc) uncertainty = MODELS[uid].get_uncertainty(doc) save_state() return flask.jsonify(document=document, doc_number=doc_number, predicted_label=predicted_label, uncertainty=uncertainty)
def get_doc(): """Get the next document for this user""" uid = str(flask.request.headers.get('uuid')) doc_number = -1 document = '' predicted_label_x = BASE_LABEL uncertainty_x = BASE_UNCERTAINTY predicted_label_y = BASE_LABEL uncertainty_y = BASE_UNCERTAINTY if uid not in MODELS: _train_model(uid) with LOCK: if uid in USER_DICT: # do what we need to get the right document for this user docs_with_labels = USER_DICT[uid]['docs_with_labels'] unlabeled_doc_ids = USER_DICT[uid]['unlabeled_doc_ids'] cand_set = select.reservoir(unlabeled_doc_ids, RNG, CAND_SIZE) # We are currently choosing based on what model 0 wants # TODO: Use both models' output to choose the next doc doc_number = SELECT_METHOD(DATASET, docs_with_labels.keys(), cand_set, MODELS[uid][0], RNG, LABEL_INCREMENT)[0] document = DATASET.doc_metadata(doc_number, 'text') doc_title = DATASET.titles[doc_number] USER_DICT[uid]['current_doc'] = doc_number if (len(docs_with_labels) >= START_TRAINING and USER_DICT[uid]['training_complete'] is True): doc = DATASET.doc_tokens(doc_number) predicted_label_x = MODELS[uid][0].predict(doc) uncertainty_x = MODELS[uid][0].get_uncertainty(doc) predicted_label_y = MODELS[uid][1].predict(doc) uncertainty_y = MODELS[uid][1].get_uncertainty(doc) _save_state() return flask.jsonify(document=document, doc_number=doc_number, doc_title=doc_title, predicted_label_x=predicted_label_x, uncertainty_x=uncertainty_x, predicted_label_y=predicted_label_y, uncertainty_y=uncertainty_y)
def _run(): """Run experiment""" parser = argparse.ArgumentParser(description='Job runner for ActiveTM ' 'experiments') parser.add_argument('settings', help=\ '''the path to a file containing settings, as described in \ README.md in the root ActiveTM directory''') parser.add_argument('outputdir', help='directory for output') parser.add_argument('label', help='identifying label') parser.add_argument('seed', default=-1, type=int, nargs='?') args = parser.parse_args() # print('Parsed arguments') settings = utils.parse_settings(args.settings) # print('Parsed settings') trueoutputdir = os.path.join(args.outputdir, settings['group']) if not os.path.exists(trueoutputdir): try: os.makedirs(trueoutputdir) except OSError: pass # print('Ensured true output directory exists') filename = socket.gethostname()+'.'+str(os.getpid()) runningfile = os.path.join(args.outputdir, 'running', filename) try: with open(runningfile, 'w') as outputfh: outputfh.write('running') # print('Created running mark') start = time.time() input_pickle = os.path.join(args.outputdir, utils.get_pickle_name(args.settings)) with open(input_pickle, 'rb') as ifh: dataset = pickle.load(ifh) # print('Got pickle') if args.seed == -1: rng = random.Random(int(settings['seed'])) else: rng = random.Random(args.seed) # print('Set random seed: ', args.seed) model = models.build(rng, settings) # print('Built model') test_doc_ids, labeled_doc_ids, unlabeled_doc_ids =\ partition_data_ids(dataset.num_docs, rng, settings) test_labels = [] test_words = [] for t in test_doc_ids: test_labels.append(dataset.labels[dataset.titles[t]]) test_words.append(dataset.doc_tokens(t)) test_labels_mean = np.mean(test_labels) known_labels = [] for t in labeled_doc_ids: known_labels.append(dataset.labels[dataset.titles[t]]) # print('Set up initial sets') SELECT_METHOD = select.factory[settings['select']] END_LABELED = int(settings['endlabeled']) LABEL_INCREMENT = int(settings['increment']) CAND_SIZE = int(settings['candsize']) results = [] end = time.time() init_time = datetime.timedelta(seconds=end-start) start = time.time() # sandt = select_and_train sandt_start = time.time() model.train(dataset, labeled_doc_ids, known_labels) # print('Trained model') sandt_end = time.time() count = 0 predictions = evaluate.get_predictions(model, test_words) pr2 = evaluate.pR2(predictions, test_labels, test_labels_mean) maes = evaluate.mean_absolute_errors(predictions, test_labels) np.savetxt(utils.get_mae_out_name(trueoutputdir, args.label, count), maes) results.append([len(labeled_doc_ids), datetime.timedelta(seconds=time.time()-start).total_seconds(), datetime.timedelta(seconds=sandt_end-sandt_start).total_seconds(), pr2]) while len(labeled_doc_ids) < END_LABELED and len(unlabeled_doc_ids) > 0: count += 1 sandt_start = time.time() # must make unlabeled_doc_ids (which is a set) into a list candidates = select.reservoir(list(unlabeled_doc_ids), rng, CAND_SIZE) chosen = SELECT_METHOD(dataset, labeled_doc_ids, candidates, model, rng, LABEL_INCREMENT) for c in chosen: known_labels.append(dataset.labels[dataset.titles[c]]) labeled_doc_ids.append(c) unlabeled_doc_ids.remove(c) model.train(dataset, labeled_doc_ids, known_labels, True) sandt_end = time.time() predictions = evaluate.get_predictions(model, test_words) pr2 = evaluate.pR2(predictions, test_labels, test_labels_mean) maes = evaluate.mean_absolute_errors(predictions, test_labels) np.savetxt(utils.get_mae_out_name(trueoutputdir, args.label, count), maes) results.append([len(labeled_doc_ids), 
datetime.timedelta(seconds=time.time()-start).total_seconds(), datetime.timedelta(seconds=sandt_end-sandt_start).total_seconds(), pr2]) model.cleanup() output = [] output.append('# init time: {:s}'.format(str(init_time))) for result in results: output.append('\t'.join([str(r) for r in result])) output.append('') with open(os.path.join(trueoutputdir, args.label), 'w') as ofh: ofh.write('\n'.join(output)) finally: os.remove(runningfile)
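# _run() only assumes that utils.parse_settings() returns a mapping with the
# keys read above; the file format itself is described in README.md. A sketch
# of the parsed result, with placeholder values:
settings = {
    'group': 'experiment-group',  # subdirectory of outputdir for results
    'seed': '531',                # fallback RNG seed when no CLI seed is given
    'select': 'random',           # key into select.factory (placeholder name)
    'endlabeled': '100',          # stop once this many documents are labeled
    'increment': '1',             # labels acquired per selection round
    'candsize': '500',            # size of the reservoir-sampled candidate pool
}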
def demo(C_SEED):
    """Runs a demo of active learning simulation with sLDA via sampling"""
    start = time.time()
    rng = random.Random(SEED)
    slda.set_seed(C_SEED)
    dataset = ankura.run_pipeline(PIPELINE)
    pre_labels = {}
    with open(SOTU_LABELS) as ifh:
        for line in ifh:
            data = line.strip().split()
            pre_labels[data[0]] = float(data[1])
    labels = []
    for doc_id in range(dataset.num_docs):
        labels.append(pre_labels[dataset.titles[doc_id]])
    end = time.time()
    print('Import took:', datetime.timedelta(seconds=end - start))
    print()
    start = time.time()
    # initialize sets
    shuffled_doc_ids = list(range(dataset.num_docs))
    rng.shuffle(shuffled_doc_ids)
    test_doc_ids = shuffled_doc_ids[:TEST_SIZE]
    test_labels = []
    test_words = []
    for t in test_doc_ids:
        test_labels.append(labels[t])
        test_words.append(dataset.doc_tokens(t))
    test_labels_mean = numpy.mean(test_labels)
    labeled_doc_ids = shuffled_doc_ids[TEST_SIZE:TEST_SIZE + START_LABELED]
    known_labels = []
    for t in labeled_doc_ids:
        known_labels.append(labels[t])
    unlabeled_doc_ids = set(shuffled_doc_ids[TEST_SIZE + START_LABELED:])
    model = slda.SamplingSLDA(rng, NUM_TOPICS, ALPHA, BETA, VAR,
                              NUM_TRAIN, NUM_SAMPLES_TRAIN, TRAIN_BURN,
                              TRAIN_LAG, NUM_SAMPLES_PREDICT, PREDICT_BURN,
                              PREDICT_LAG)
    # learning loop
    model.train(dataset, labeled_doc_ids, known_labels)
    metric = evaluate.pR2(model, test_words, test_labels, test_labels_mean)
    print(len(labeled_doc_ids), metric,
          datetime.timedelta(seconds=time.time() - start))
    while len(labeled_doc_ids) < END_LABELED and len(unlabeled_doc_ids) > 0:
        candidates = select.reservoir(list(unlabeled_doc_ids), rng, CAND_SIZE)
        chosen = SELECT_METHOD(dataset, labeled_doc_ids, candidates, model,
                               rng, LABEL_INCREMENT)
        for c in chosen:
            known_labels.append(labels[c])
            labeled_doc_ids.append(c)
            unlabeled_doc_ids.remove(c)
        model.train(dataset, labeled_doc_ids, known_labels, True)
        metric = evaluate.pR2(model, test_words, test_labels,
                              test_labels_mean)
        print(len(labeled_doc_ids), metric,
              datetime.timedelta(seconds=time.time() - start))
    model.cleanup()
    end = time.time()
    print()
    print('Total simulation time:', datetime.timedelta(seconds=end - start))
    print()
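# select.reservoir() is called throughout with a sequence, an rng, and a
# sample size. A standard reservoir-sampling routine (Algorithm R) with that
# signature; the select module's actual implementation may differ:
def reservoir(items, rng, sample_size):
    """Uniformly sample up to sample_size items in a single pass."""
    sample = []
    for i, item in enumerate(items):
        if i < sample_size:
            sample.append(item)
        else:
            # keep this item with probability sample_size / (i + 1)
            j = rng.randint(0, i)
            if j < sample_size:
                sample[j] = item
    return sample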
init_time = datetime.timedelta(seconds=end - start)
start = time.time()
select_and_train_start = time.time()
model.train(dataset, labeled_doc_ids, known_labels)
select_and_train_end = time.time()
metric = evaluate.pR2(model, test_words, test_labels, test_labels_mean)
results.append(
    [len(labeled_doc_ids),
     datetime.timedelta(seconds=time.time() - start).total_seconds(),
     datetime.timedelta(
         seconds=select_and_train_end - select_and_train_start
     ).total_seconds(),
     metric])
while len(labeled_doc_ids) < END_LABELED and len(unlabeled_doc_ids) > 0:
    select_and_train_start = time.time()
    # must make unlabeled_doc_ids (which is a set) into a list
    candidates = select.reservoir(list(unlabeled_doc_ids), rng, CAND_SIZE)
    chosen = SELECT_METHOD(dataset, labeled_doc_ids, candidates, model, rng,
                           LABEL_INCREMENT)
    for c in chosen:
        known_labels.append(dataset.labels[dataset.titles[c]])
        labeled_doc_ids.append(c)
        unlabeled_doc_ids.remove(c)
    model.train(dataset, labeled_doc_ids, known_labels, True)
    select_and_train_end = time.time()
    metric = evaluate.pR2(model, test_words, test_labels, test_labels_mean)
    results.append(
        [len(labeled_doc_ids),
         datetime.timedelta(seconds=time.time() - start).total_seconds(),
         datetime.timedelta(
             seconds=select_and_train_end - select_and_train_start
         ).total_seconds(),
         metric])
model.cleanup()
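# evaluate.pR2() appears with two signatures in this section: predictions-based
# in _run() and model-based in demo() and the fragment above. A sketch of the
# predictions-based variant, assuming pR2 is the usual predictive R^2
# (1 - SSE/SST, with SST taken about the test-label mean):
import numpy as np

def pR2(predictions, test_labels, test_labels_mean):
    """Predictive R^2 of predictions against held-out labels."""
    predictions = np.asarray(predictions, dtype=float)
    test_labels = np.asarray(test_labels, dtype=float)
    sse = np.sum((test_labels - predictions) ** 2)
    sst = np.sum((test_labels - test_labels_mean) ** 2)
    return 1.0 - sse / sst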