Example #1
def get_doc():
    """Gets the next document for this user"""
    uid = str(flask.request.headers.get('uuid'))
    doc_number = -1
    document = ''
    predicted_label = BASE_LABEL
    uncertainty = BASE_UNCERTAINTY
    if uid not in MODELS:
        train_model(uid)
    with LOCK:
        if uid in USER_DICT:
            # do what we need to get the right document for this user
            labeled_doc_ids = USER_DICT[uid]['labeled_doc_ids']
            unlabeled_doc_ids = USER_DICT[uid]['unlabeled_doc_ids']
            candidates = select.reservoir(unlabeled_doc_ids, RNG, CAND_SIZE)
            doc_number = SELECT_METHOD(DATASET, labeled_doc_ids, candidates,
                                       MODELS[uid], RNG, LABEL_INCREMENT)[0]
            document = DATASET.doc_metadata(doc_number, 'text')
            USER_DICT[uid]['current_doc'] = doc_number
            if (len(labeled_doc_ids) >= START_TRAINING and
                    USER_DICT[uid]['training_complete'] is True):
                doc = DATASET.doc_tokens(doc_number)
                predicted_label = MODELS[uid].predict(doc)
                uncertainty = MODELS[uid].get_uncertainty(doc)
    save_state()
    return flask.jsonify(document=document, doc_number=doc_number,
                         predicted_label=predicted_label,
                         uncertainty=uncertainty)
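
The handler above is only the view function; a minimal sketch (not from the original module) of how it might be exposed and exercised follows. The '/get_doc' route, the APP object, and the client call are assumptions.

# Minimal wiring sketch; the route path '/get_doc' and the APP object are
# assumptions, not part of the original source.
import flask

APP = flask.Flask(__name__)
APP.add_url_rule('/get_doc', 'get_doc', get_doc)

# A client would pass the user's id in the 'uuid' header the handler reads, e.g.:
#   import requests
#   resp = requests.get('http://localhost:5000/get_doc', headers={'uuid': 'user-42'})
#   resp.json()  # -> document, doc_number, predicted_label, uncertainty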
Example #2
def get_doc():
    """Get the next document for this user"""
    uid = str(flask.request.headers.get('uuid'))
    doc_number = -1
    document = ''
    doc_title = ''
    predicted_label_x = BASE_LABEL
    uncertainty_x = BASE_UNCERTAINTY
    predicted_label_y = BASE_LABEL
    uncertainty_y = BASE_UNCERTAINTY
    if uid not in MODELS:
        _train_model(uid)
    with LOCK:
        if uid in USER_DICT:
            # do what we need to get the right document for this user
            docs_with_labels = USER_DICT[uid]['docs_with_labels']
            unlabeled_doc_ids = USER_DICT[uid]['unlabeled_doc_ids']
            cand_set = select.reservoir(unlabeled_doc_ids, RNG, CAND_SIZE)
            # We are currently choosing based on what model 0 wants
            # TODO: Use both models' output to choose the next doc
            doc_number = SELECT_METHOD(DATASET, docs_with_labels.keys(), cand_set,
                                       MODELS[uid][0], RNG, LABEL_INCREMENT)[0]
            document = DATASET.doc_metadata(doc_number, 'text')
            doc_title = DATASET.titles[doc_number]
            USER_DICT[uid]['current_doc'] = doc_number
            if (len(docs_with_labels) >= START_TRAINING and
                    USER_DICT[uid]['training_complete'] is True):
                doc = DATASET.doc_tokens(doc_number)
                predicted_label_x = MODELS[uid][0].predict(doc)
                uncertainty_x = MODELS[uid][0].get_uncertainty(doc)
                predicted_label_y = MODELS[uid][1].predict(doc)
                uncertainty_y = MODELS[uid][1].get_uncertainty(doc)
    _save_state()
    return flask.jsonify(document=document,
                         doc_number=doc_number,
                         doc_title=doc_title,
                         predicted_label_x=predicted_label_x,
                         uncertainty_x=uncertainty_x,
                         predicted_label_y=predicted_label_y,
                         uncertainty_y=uncertainty_y)
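
This version keeps two models per user and stores labels in a dict. A minimal sketch, inferred from how the handler uses the fields rather than taken from the original module, of the per-user state it expects in USER_DICT:

# Inferred per-user state; the field types are assumptions based on how the
# handler uses them (len(), .keys(), reservoir sampling), not the actual schema.
USER_DICT = {
    'some-user-uuid': {
        'docs_with_labels': {},       # doc_id -> label provided by the user
        'unlabeled_doc_ids': [],      # doc ids still eligible for selection
        'current_doc': -1,            # doc id most recently served
        'training_complete': False,   # flipped once background training finishes
    },
}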
Example #3
def _run():
    """Run experiment"""
    parser = argparse.ArgumentParser(description='Job runner for ActiveTM '
            'experiments')
    parser.add_argument('settings', help='the path to a file containing '
            'settings, as described in README.md in the root ActiveTM '
            'directory')
    parser.add_argument('outputdir', help='directory for output')
    parser.add_argument('label', help='identifying label')
    parser.add_argument('seed', default=-1, type=int, nargs='?')
    args = parser.parse_args()
    # print('Parsed arguments')

    settings = utils.parse_settings(args.settings)
    # print('Parsed settings')
    trueoutputdir = os.path.join(args.outputdir, settings['group'])
    if not os.path.exists(trueoutputdir):
        try:
            os.makedirs(trueoutputdir)
        except OSError:
            pass
    # print('Ensured true output directory exists')
    filename = socket.gethostname()+'.'+str(os.getpid())
    runningfile = os.path.join(args.outputdir, 'running',
            filename)
    try:
        with open(runningfile, 'w') as outputfh:
            outputfh.write('running')
        # print('Created running mark')

        start = time.time()
        input_pickle = os.path.join(args.outputdir, utils.get_pickle_name(args.settings))
        with open(input_pickle, 'rb') as ifh:
            dataset = pickle.load(ifh)
        # print('Got pickle')
        if args.seed == -1:
            rng = random.Random(int(settings['seed']))
        else:
            rng = random.Random(args.seed)
        # print('Set random seed: ', args.seed)
        model = models.build(rng, settings)
        # print('Built model')
        test_doc_ids, labeled_doc_ids, unlabeled_doc_ids =\
                partition_data_ids(dataset.num_docs, rng, settings)
        test_labels = []
        test_words = []
        for t in test_doc_ids:
            test_labels.append(dataset.labels[dataset.titles[t]])
            test_words.append(dataset.doc_tokens(t))
        test_labels_mean = np.mean(test_labels)
        known_labels = []
        for t in labeled_doc_ids:
            known_labels.append(dataset.labels[dataset.titles[t]])
        # print('Set up initial sets')

        SELECT_METHOD = select.factory[settings['select']]
        END_LABELED = int(settings['endlabeled'])
        LABEL_INCREMENT = int(settings['increment'])
        CAND_SIZE = int(settings['candsize'])
        results = []
        end = time.time()
        init_time = datetime.timedelta(seconds=end-start)

        start = time.time()
        # sandt = select_and_train
        sandt_start = time.time()
        model.train(dataset, labeled_doc_ids, known_labels)
        # print('Trained model')
        sandt_end = time.time()
        count = 0
        predictions = evaluate.get_predictions(model, test_words)
        pr2 = evaluate.pR2(predictions,
                           test_labels,
                           test_labels_mean)
        maes = evaluate.mean_absolute_errors(predictions, test_labels)
        np.savetxt(utils.get_mae_out_name(trueoutputdir, args.label, count),
                   maes)
        results.append([len(labeled_doc_ids),
                datetime.timedelta(seconds=time.time()-start).total_seconds(),
                datetime.timedelta(seconds=sandt_end-sandt_start).total_seconds(),
                pr2])
        while len(labeled_doc_ids) < END_LABELED and len(unlabeled_doc_ids) > 0:
            count += 1
            sandt_start = time.time()
            # must make unlabeled_doc_ids (which is a set) into a list
            candidates = select.reservoir(list(unlabeled_doc_ids), rng, CAND_SIZE)
            chosen = SELECT_METHOD(dataset, labeled_doc_ids, candidates, model,
                    rng, LABEL_INCREMENT)
            for c in chosen:
                known_labels.append(dataset.labels[dataset.titles[c]])
                labeled_doc_ids.append(c)
                unlabeled_doc_ids.remove(c)
            model.train(dataset, labeled_doc_ids, known_labels, True)
            sandt_end = time.time()
            predictions = evaluate.get_predictions(model, test_words)
            pr2 = evaluate.pR2(predictions, test_labels, test_labels_mean)
            maes = evaluate.mean_absolute_errors(predictions, test_labels)
            np.savetxt(utils.get_mae_out_name(trueoutputdir, args.label, count),
                       maes)
            results.append([len(labeled_doc_ids),
                    datetime.timedelta(seconds=time.time()-start).total_seconds(),
                    datetime.timedelta(seconds=sandt_end-sandt_start).total_seconds(),
                    pr2])
        model.cleanup()

        output = []
        output.append('# init time: {:s}'.format(str(init_time)))
        for result in results:
            output.append('\t'.join([str(r) for r in result]))
        output.append('')
        with open(os.path.join(trueoutputdir, args.label), 'w') as ofh:
            ofh.write('\n'.join(output))
    finally:
        os.remove(runningfile)
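
Given the argument parser defined at the top of _run(), the runner takes a settings file, an output directory, an identifying label, and an optional integer seed. A minimal sketch of an invocation; the script name and paths are placeholders, not from the original repository.

import sys

# Equivalent to: python runner.py path/to/settings.txt results/ run0 42
sys.argv = ['runner.py', 'path/to/settings.txt', 'results/', 'run0', '42']
_run()  # args.settings='path/to/settings.txt', args.outputdir='results/',
        # args.label='run0', args.seed=42 (optional; -1 falls back to settings['seed'])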
Example #4
def demo(C_SEED):
    """Runs a demo of active learning simulation with sLDA via sampling"""
    start = time.time()
    rng = random.Random(SEED)
    slda.set_seed(C_SEED)
    dataset = ankura.run_pipeline(PIPELINE)
    pre_labels = {}
    with open(SOTU_LABELS) as ifh:
        for line in ifh:
            data = line.strip().split()
            pre_labels[data[0]] = float(data[1])
    labels = []
    for doc_id in range(dataset.num_docs):
        labels.append(pre_labels[dataset.titles[doc_id]])
    end = time.time()
    print("Import took:", datetime.timedelta(seconds=end - start))
    print()

    start = time.time()

    # initialize sets
    shuffled_doc_ids = list(range(dataset.num_docs))
    rng.shuffle(shuffled_doc_ids)
    test_doc_ids = shuffled_doc_ids[:TEST_SIZE]
    test_labels = []
    test_words = []
    for t in test_doc_ids:
        test_labels.append(labels[t])
        test_words.append(dataset.doc_tokens(t))
    test_labels_mean = numpy.mean(test_labels)
    labeled_doc_ids = shuffled_doc_ids[TEST_SIZE : TEST_SIZE + START_LABELED]
    known_labels = []
    for t in labeled_doc_ids:
        known_labels.append(labels[t])
    unlabeled_doc_ids = set(shuffled_doc_ids[TEST_SIZE + START_LABELED :])

    model = slda.SamplingSLDA(
        rng,
        NUM_TOPICS,
        ALPHA,
        BETA,
        VAR,
        NUM_TRAIN,
        NUM_SAMPLES_TRAIN,
        TRAIN_BURN,
        TRAIN_LAG,
        NUM_SAMPLES_PREDICT,
        PREDICT_BURN,
        PREDICT_LAG,
    )

    # learning loop
    model.train(dataset, labeled_doc_ids, known_labels)
    metric = evaluate.pR2(model, test_words, test_labels, test_labels_mean)
    print(len(labeled_doc_ids), metric, datetime.timedelta(seconds=time.time() - start))
    while len(labeled_doc_ids) < END_LABELED and len(unlabeled_doc_ids) > 0:
        candidates = select.reservoir(list(unlabeled_doc_ids), rng, CAND_SIZE)
        chosen = SELECT_METHOD(dataset, labeled_doc_ids, candidates, model, rng, LABEL_INCREMENT)
        for c in chosen:
            known_labels.append(labels[c])
            labeled_doc_ids.append(c)
            unlabeled_doc_ids.remove(c)
        model.train(dataset, labeled_doc_ids, known_labels, True)
        metric = evaluate.pR2(model, test_words, test_labels, test_labels_mean)
        print(len(labeled_doc_ids), metric, datetime.timedelta(seconds=time.time() - start))
    model.cleanup()
    end = time.time()
    print()
    print("Total simulation time:", datetime.timedelta(seconds=end - start))
    print()
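
demo() (in this example and the next) relies on a number of module-level constants. A minimal sketch of the names it expects, with purely illustrative values that are assumptions rather than the repository's actual configuration:

# Illustrative values only; the real configuration lives in the original module.
SEED = 531                      # Python-side RNG seed
TEST_SIZE = 200                 # held-out documents for evaluation
START_LABELED = 50              # size of the initial labeled pool
END_LABELED = 150               # stop once this many documents are labeled
LABEL_INCREMENT = 1             # documents labeled per active-learning round
CAND_SIZE = 500                 # reservoir-sampled candidate pool per round
NUM_TOPICS, ALPHA, BETA, VAR = 20, 0.1, 0.01, 1.0      # sLDA hyperparameters
NUM_TRAIN, NUM_SAMPLES_TRAIN, TRAIN_BURN, TRAIN_LAG = 5, 5, 50, 50
NUM_SAMPLES_PREDICT, PREDICT_BURN, PREDICT_LAG = 5, 10, 5
SOTU_LABELS = 'sotu_labels.txt'                        # placeholder label-file path
# PIPELINE and SELECT_METHOD are also required; their construction depends on
# ankura and the select module and is omitted here.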
Example #5
def demo(C_SEED):
    """Runs a demo of active learning simulation with sLDA via sampling"""
    start = time.time()
    rng = random.Random(SEED)
    slda.set_seed(C_SEED)
    dataset = ankura.run_pipeline(PIPELINE)
    pre_labels = {}
    with open(SOTU_LABELS) as ifh:
        for line in ifh:
            data = line.strip().split()
            pre_labels[data[0]] = float(data[1])
    labels = []
    for doc_id in range(dataset.num_docs):
        labels.append(pre_labels[dataset.titles[doc_id]])
    end = time.time()
    print('Import took:', datetime.timedelta(seconds=end-start))
    print()

    start = time.time()

    # initialize sets
    shuffled_doc_ids = list(range(dataset.num_docs))
    rng.shuffle(shuffled_doc_ids)
    test_doc_ids = shuffled_doc_ids[:TEST_SIZE]
    test_labels = []
    test_words = []
    for t in test_doc_ids:
        test_labels.append(labels[t])
        test_words.append(dataset.doc_tokens(t))
    test_labels_mean = numpy.mean(test_labels)
    labeled_doc_ids = shuffled_doc_ids[TEST_SIZE:TEST_SIZE+START_LABELED]
    known_labels = []
    for t in labeled_doc_ids:
        known_labels.append(labels[t])
    unlabeled_doc_ids = set(shuffled_doc_ids[TEST_SIZE+START_LABELED:])

    model = slda.SamplingSLDA(rng, NUM_TOPICS, ALPHA, BETA, VAR,
            NUM_TRAIN, NUM_SAMPLES_TRAIN, TRAIN_BURN, TRAIN_LAG,
            NUM_SAMPLES_PREDICT, PREDICT_BURN, PREDICT_LAG)

    # learning loop
    model.train(dataset, labeled_doc_ids, known_labels)
    metric = evaluate.pR2(model, test_words, test_labels, test_labels_mean)
    print(len(labeled_doc_ids), metric,
            datetime.timedelta(seconds=time.time()-start))
    while len(labeled_doc_ids) < END_LABELED and len(unlabeled_doc_ids) > 0:
        candidates = select.reservoir(list(unlabeled_doc_ids),
                rng, CAND_SIZE)
        chosen = SELECT_METHOD(dataset, labeled_doc_ids, candidates, model, rng, LABEL_INCREMENT)
        for c in chosen:
            known_labels.append(labels[c])
            labeled_doc_ids.append(c)
            unlabeled_doc_ids.remove(c)
        model.train(dataset, labeled_doc_ids, known_labels, True)
        metric = evaluate.pR2(model, test_words, test_labels, test_labels_mean)
        print(len(labeled_doc_ids), metric,
                datetime.timedelta(seconds=time.time()-start))
    model.cleanup()
    end = time.time()
    print()
    print('Total simulation time:', datetime.timedelta(seconds=end-start))
    print()
Example #6
        init_time = datetime.timedelta(seconds=end-start)

        start = time.time()
        select_and_train_start = time.time()
        model.train(dataset, labeled_doc_ids, known_labels)
        select_and_train_end = time.time()
        metric = evaluate.pR2(model, test_words, test_labels,
                test_labels_mean)
        results.append([len(labeled_doc_ids),
                datetime.timedelta(seconds=time.time()-start).total_seconds(),
                datetime.timedelta(seconds=select_and_train_end-select_and_train_start).total_seconds(),
                metric])
        while len(labeled_doc_ids) < END_LABELED and len(unlabeled_doc_ids) > 0:
            select_and_train_start = time.time()
            # must make unlabeled_doc_ids (which is a set) into a list
            candidates = select.reservoir(list(unlabeled_doc_ids), rng, CAND_SIZE)
            chosen = SELECT_METHOD(dataset, labeled_doc_ids, candidates, model,
                    rng, LABEL_INCREMENT)
            for c in chosen:
                known_labels.append(dataset.labels[dataset.titles[c]])
                labeled_doc_ids.append(c)
                unlabeled_doc_ids.remove(c)
            model.train(dataset, labeled_doc_ids, known_labels, True)
            select_and_train_end = time.time()
            metric = evaluate.pR2(model, test_words, test_labels,
                    test_labels_mean)
            results.append([len(labeled_doc_ids),
                    datetime.timedelta(seconds=time.time()-start).total_seconds(),
                    datetime.timedelta(seconds=select_and_train_end-select_and_train_start).total_seconds(),
                    metric])
        model.cleanup()
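
Unlike Example #3, this fragment ends after filling results. A minimal sketch (not from the original file) of writing those rows out in the same tab-separated form Example #3 uses:

# Each row holds: labeled-doc count, cumulative seconds, select+train seconds, pR2.
with open('results.tsv', 'w') as ofh:    # placeholder output path
    for row in results:
        ofh.write('\t'.join(str(value) for value in row) + '\n')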