Example #1
File: main.py Project: cojoco/activetm
def get_groups(config):
    result = set()
    settings = generate_settings(config)
    for s in settings:
        d = utils.parse_settings(s)
        result.add(d['group'])
    return sorted(list(result))
Example #2
File: main.py Project: nOkuda/activetm
def get_groups(config):
    """Get groups (as specified in settings file)"""
    result = set()
    settings = generate_settings(config)
    for setting in settings:
        cur_settings = utils.parse_settings(setting)
        result.add(cur_settings['group'])
    return sorted(list(result))
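Every example on this page passes a settings file path to `utils.parse_settings` and then reads the result like a dict (for instance `cur_settings['group']` above). ActiveTM's actual parser is not shown here; the following is a minimal sketch with that dict-returning behavior, assuming a plain-text file of whitespace-separated `key value` lines (the function body and file format are assumptions, not the project's code):

def parse_settings(path):
    """Hypothetical stand-in for utils.parse_settings.

    Reads 'key value' lines into a dict of strings; blank lines and
    '#' comments are skipped. The real format is documented in the
    ActiveTM README.
    """
    settings = {}
    with open(path) as ifh:
        for line in ifh:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            key, value = line.split(None, 1)
            settings[key] = value
    return settings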
Example #3
def run():
    """plot results from given experiment"""
    args = parse_arguments()
    corpora = ['amazon', 'frus', 'sotu_broken', 'yelp']
    selections = ['random', 'top_topic', 'topic_comp']
    for corpus in corpora:
        main.make_plots(os.path.join(args.resultspath, corpus), selections,
                        utils.parse_settings(args.deltatxt))
Example #4
def get_groups(config):
    """Get groups (as specified in settings file)"""
    result = set()
    settings = generate_settings(config)
    for setting in settings:
        cur_settings = utils.parse_settings(setting)
        result.add(cur_settings['group'])
    return sorted(list(result))
Example #5
def run():
    """plot results from given experiment"""
    args = parse_arguments()
    corpora = [
        'amazon',
        'frus',
        'sotu_broken',
        'yelp'
    ]
    selections = [
        'random',
        'top_topic',
        'topic_comp'
    ]
    for corpus in corpora:
        main.make_plots(
            os.path.join(args.resultspath, corpus),
            selections,
            utils.parse_settings(args.deltatxt))
Example #6
File: main.py Project: nOkuda/activetm
def _run():
    """Run code"""
    args = _parse_args()
    try:
        begin_time = datetime.datetime.now()
        slack_notification('Starting job: '+args.outputdir)
        runningdir = os.path.join(args.outputdir, 'running')
        if os.path.exists(runningdir):
            shutil.rmtree(runningdir)
        try:
            os.makedirs(runningdir)
        except OSError:
            pass
        hosts = get_hosts(args.hosts)
        check_counts(hosts, utils.count_settings(args.config))
        if not os.path.exists(args.outputdir):
            logging.getLogger(__name__).error('Cannot write output to: '+args.outputdir)
            sys.exit(-1)
        groups = get_groups(args.config)
        pickle_data(hosts, args.config, args.working_dir, args.outputdir)
        run_jobs(
            hosts,
            args.config,
            args.working_dir,
            args.outputdir)
        corpora = get_corpora(args.config)
        for corpus in corpora:
            make_plots(
                os.path.join(args.outputdir, corpus),
                groups,
                utils.parse_settings(args.deltastxt))
        run_time = datetime.datetime.now() - begin_time
        with open(os.path.join(args.outputdir, 'run_time'), 'w') as ofh:
            ofh.write(str(run_time))
        os.rmdir(runningdir)
        slack_notification('Job complete: '+args.outputdir)
        if args.email:
            send_notification(args.email, args.outputdir, run_time)
    except:
        slack_notification('Job died: '+args.outputdir)
        raise
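A note on the 'running' directory created above: each worker process (see Example #11 below) writes a `<hostname>.<pid>` marker file into it for the duration of its job and deletes the marker when it finishes, so the final `os.rmdir(runningdir)` here can only succeed once no worker markers remain. A minimal sketch of the worker-side half of that convention (the helper name is illustrative, not part of ActiveTM):

import os
import socket

def mark_running(outputdir):
    # Drop a per-worker marker file into <outputdir>/running, mirroring
    # the runningfile logic in Example #11; the caller is expected to
    # os.remove() the returned path in a finally block.
    runningdir = os.path.join(outputdir, 'running')
    marker = os.path.join(runningdir, socket.gethostname() + '.' + str(os.getpid()))
    with open(marker, 'w') as outputfh:
        outputfh.write('running')
    return marker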
Example #7
def _run():
    parser = argparse.ArgumentParser(description='Pickler of ActiveTM datasets')
    parser.add_argument('settings', help=\
            '''the path to a file containing settings, as described in \
            README.md in the root ActiveTM directory''')
    parser.add_argument('outputdir', help='directory for output')
    args = parser.parse_args()

    start = time.time()
    settings = utils.parse_settings(args.settings)
    pickle_name = utils.get_pickle_name(args.settings)
    if not os.path.exists(os.path.join(args.outputdir, pickle_name)):
        pre_dataset = get_dataset(settings)
        labels = labeled.get_labels(settings['labels'])
        dataset = labeled.LabeledDataset(pre_dataset, labels)
        with open(os.path.join(args.outputdir, pickle_name), 'wb') as ofh:
            pickle.dump(dataset, ofh)
    end = time.time()
    import_time = datetime.timedelta(seconds=end-start)
    with open(os.path.join(args.outputdir, pickle_name+'_import.time'), 'w') as ofh:
        ofh.write('# import time: {:s}\n'.format(str(import_time)))
Example #8
def _run():
    parser = argparse.ArgumentParser(description='Pickler of ActiveTM datasets')
    parser.add_argument('settings', help=\
            '''the path to a file containing settings, as described in \
            README.md in the root ActiveTM directory''')
    parser.add_argument('outputdir', help='directory for output')
    args = parser.parse_args()

    start = time.time()
    settings = utils.parse_settings(args.settings)
    pickle_name = utils.get_pickle_name(args.settings)
    if not os.path.exists(os.path.join(args.outputdir, pickle_name)):
        pre_dataset = get_dataset(settings)
        labels = labeled.get_labels(settings['labels'])
        dataset = labeled.LabeledDataset(pre_dataset, labels)
        with open(os.path.join(args.outputdir, pickle_name), 'wb') as ofh:
            pickle.dump(dataset, ofh)
    end = time.time()
    import_time = datetime.timedelta(seconds=end-start)
    with open(os.path.join(args.outputdir, pickle_name+'_import.time'), 'w') as ofh:
        ofh.write('# import time: {:s}\n'.format(str(import_time)))
Example #9
def _run():
    parser = argparse.ArgumentParser(description="Pickler of ActiveTM datasets")
    parser.add_argument(
        "settings",
        help="""the path to a file containing settings, as described in \
            README.md in the root ActiveTM directory""",
    )
    parser.add_argument("outputdir", help="directory for output")
    args = parser.parse_args()

    start = time.time()
    settings = utils.parse_settings(args.settings)
    pickle_name = utils.get_pickle_name(args.settings)
    if not os.path.exists(os.path.join(args.outputdir, pickle_name)):
        pre_dataset = get_dataset(settings)
        labels = labeled.get_labels(settings["labels"])
        dataset = labeled.LabeledDataset(pre_dataset, labels)
        with open(os.path.join(args.outputdir, pickle_name), "wb") as ofh:
            pickle.dump(dataset, ofh)
    end = time.time()
    import_time = datetime.timedelta(seconds=end - start)
    with open(os.path.join(args.outputdir, pickle_name + "_import.time"), "w") as ofh:
        ofh.write("# import time: {:s}\n".format(str(import_time)))
Example #10
def _run():
    """Run code"""
    args = _parse_args()
    try:
        begin_time = datetime.datetime.now()
        slack_notification('Starting job: ' + args.outputdir)
        runningdir = os.path.join(args.outputdir, 'running')
        if os.path.exists(runningdir):
            shutil.rmtree(runningdir)
        try:
            os.makedirs(runningdir)
        except OSError:
            pass
        hosts = get_hosts(args.hosts)
        check_counts(hosts, utils.count_settings(args.config))
        if not os.path.exists(args.outputdir):
            logging.getLogger(__name__).error('Cannot write output to: ' +
                                              args.outputdir)
            sys.exit(-1)
        groups = get_groups(args.config)
        pickle_data(hosts, args.config, args.working_dir, args.outputdir)
        run_jobs(hosts, args.config, args.working_dir, args.outputdir)
        corpora = get_corpora(args.config)
        for corpus in corpora:
            make_plots(os.path.join(args.outputdir, corpus), groups,
                       utils.parse_settings(args.deltastxt))
        run_time = datetime.datetime.now() - begin_time
        with open(os.path.join(args.outputdir, 'run_time'), 'w') as ofh:
            ofh.write(str(run_time))
        os.rmdir(runningdir)
        slack_notification('Job complete: ' + args.outputdir)
        if args.email:
            send_notification(args.email, args.outputdir, run_time)
    except:
        slack_notification('Job died: ' + args.outputdir)
        raise
Example #11
def _run():
    """Run experiment"""
    parser = argparse.ArgumentParser(description='Job runner for ActiveTM '
            'experiments')
    parser.add_argument('settings', help=\
            '''the path to a file containing settings, as described in \
            README.md in the root ActiveTM directory''')
    parser.add_argument('outputdir', help='directory for output')
    parser.add_argument('label', help='identifying label')
    parser.add_argument('seed', default=-1, type=int, nargs='?')
    args = parser.parse_args()
    # print('Parsed arguments')

    settings = utils.parse_settings(args.settings)
    # print('Parsed settings')
    trueoutputdir = os.path.join(args.outputdir, settings['group'])
    if not os.path.exists(trueoutputdir):
        try:
            os.makedirs(trueoutputdir)
        except OSError:
            pass
    # print('Ensured true output directory exists')
    filename = socket.gethostname()+'.'+str(os.getpid())
    runningfile = os.path.join(args.outputdir, 'running',
            filename)
    try:
        with open(runningfile, 'w') as outputfh:
            outputfh.write('running')
        # print('Created running mark')

        start = time.time()
        input_pickle = os.path.join(args.outputdir, utils.get_pickle_name(args.settings))
        with open(input_pickle, 'rb') as ifh:
            dataset = pickle.load(ifh)
        # print('Got pickle')
        if args.seed == -1:
            rng = random.Random(int(settings['seed']))
        else:
            rng = random.Random(args.seed)
        # print('Set random seed: ', args.seed)
        model = models.build(rng, settings)
        # print('Built model')
        test_doc_ids, labeled_doc_ids, unlabeled_doc_ids =\
                partition_data_ids(dataset.num_docs, rng, settings)
        test_labels = []
        test_words = []
        for t in test_doc_ids:
            test_labels.append(dataset.labels[dataset.titles[t]])
            test_words.append(dataset.doc_tokens(t))
        test_labels_mean = np.mean(test_labels)
        known_labels = []
        for t in labeled_doc_ids:
            known_labels.append(dataset.labels[dataset.titles[t]])
        # print('Set up initial sets')

        SELECT_METHOD = select.factory[settings['select']]
        END_LABELED = int(settings['endlabeled'])
        LABEL_INCREMENT = int(settings['increment'])
        CAND_SIZE = int(settings['candsize'])
        results = []
        end = time.time()
        init_time = datetime.timedelta(seconds=end-start)

        start = time.time()
        # sandt = select_and_train
        sandt_start = time.time()
        model.train(dataset, labeled_doc_ids, known_labels)
        # print('Trained model')
        sandt_end = time.time()
        count = 0
        predictions = evaluate.get_predictions(model, test_words)
        pr2 = evaluate.pR2(predictions,
                           test_labels,
                           test_labels_mean)
        maes = evaluate.mean_absolute_errors(predictions, test_labels)
        np.savetxt(utils.get_mae_out_name(trueoutputdir, args.label, count),
                   maes)
        results.append([len(labeled_doc_ids),
                datetime.timedelta(seconds=time.time()-start).total_seconds(),
                datetime.timedelta(seconds=sandt_end-sandt_start).total_seconds(),
                pr2])
        while len(labeled_doc_ids) < END_LABELED and len(unlabeled_doc_ids) > 0:
            count += 1
            sandt_start = time.time()
            # must make unlabeled_doc_ids (which is a set) into a list
            candidates = select.reservoir(list(unlabeled_doc_ids), rng, CAND_SIZE)
            chosen = SELECT_METHOD(dataset, labeled_doc_ids, candidates, model,
                    rng, LABEL_INCREMENT)
            for c in chosen:
                known_labels.append(dataset.labels[dataset.titles[c]])
                labeled_doc_ids.append(c)
                unlabeled_doc_ids.remove(c)
            model.train(dataset, labeled_doc_ids, known_labels, True)
            sandt_end = time.time()
            predictions = evaluate.get_predictions(model, test_words)
            pr2 = evaluate.pR2(predictions, test_labels, test_labels_mean)
            maes = evaluate.mean_absolute_errors(predictions, test_labels)
            np.savetxt(utils.get_mae_out_name(trueoutputdir, args.label, count),
                       maes)
            results.append([len(labeled_doc_ids),
                    datetime.timedelta(seconds=time.time()-start).total_seconds(),
                    datetime.timedelta(seconds=sandt_end-sandt_start).total_seconds(),
                    pr2])
        model.cleanup()

        output = []
        output.append('# init time: {:s}'.format(str(init_time)))
        for result in results:
            output.append('\t'.join([str(r) for r in result]))
        output.append('')
        with open(os.path.join(trueoutputdir, args.label), 'w') as ofh:
            ofh.write('\n'.join(output))
    finally:
        os.remove(runningfile)
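Across the examples, the dict returned by `utils.parse_settings` is indexed with quite a few keys: 'group' (Examples #1, #2, #11), 'seed', 'select', 'endlabeled', 'increment', 'candsize' (Example #11), 'labels' (Examples #7-#9), and the pipeline keys 'stopwords', 'rare', 'common', 'smalldoc', 'pregenerate' (Example #12). A hypothetical example of what that parsed dict might contain; the keys come from the code above, while the values and paths are made up for illustration:

settings = {
    'group': 'example_group',              # output subdirectory (Examples #1, #2, #11)
    'seed': '531',                         # RNG seed, read with int() in Example #11
    'select': 'random',                    # key into select.factory (Example #11)
    'endlabeled': '100',                   # stop once this many documents are labeled
    'increment': '10',                     # labels added per active-learning round
    'candsize': '500',                     # reservoir-sample size for candidate documents
    'labels': '/path/to/labels.txt',       # label file (Examples #7-#9)
    'stopwords': '/path/to/stopwords.txt', # pipeline filters (Example #12)
    'rare': '5',
    'common': '1500',
    'smalldoc': '5',
    'pregenerate': 'NO',                   # 'YES' appends the pregenerate step
}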
Example #12
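# [snippet truncated: the missing lines above presumably belong to a
#  get_dataset(settings) helper that starts the PIPELINE list handed to
#  ankura.pipeline.run_pipeline below]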
            (ankura.pipeline.filter_stopwords, settings['stopwords']),
            (ankura.pipeline.filter_rarewords, int(settings['rare'])),
            (ankura.pipeline.filter_commonwords, int(settings['common'])),
            (ankura.pipeline.filter_smalldocs, int(settings['smalldoc']))])
    if settings['pregenerate'] == 'YES':
        # A 1-tuple keeps this entry in the same (function, args...) shape as
        # the other pipeline steps; the original double parentheses appended
        # the bare function rather than a tuple.
        PIPELINE.append((ankura.pipeline.pregenerate_doc_tokens,))
    return ankura.pipeline.run_pipeline(PIPELINE)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Pickler of ActiveTM datasets')
    parser.add_argument('settings', help=\
            '''the path to a file containing settings, as described in \
            README.md in the root ActiveTM directory''')
    parser.add_argument('outputdir', help='directory for output')
    args = parser.parse_args()

    start = time.time()
    settings = utils.parse_settings(args.settings)
    pickle_name = utils.get_pickle_name(args.settings)
    if not os.path.exists(os.path.join(args.outputdir, pickle_name)):
        pre_dataset = get_dataset(settings)
        labels = labeled.get_labels(settings['labels'])
        dataset = labeled.LabeledDataset(pre_dataset, labels)
        with open(os.path.join(args.outputdir, pickle_name), 'wb') as ofh:
            pickle.dump(dataset, ofh)
    end = time.time()
    import_time = datetime.timedelta(seconds=end-start)
    with open(os.path.join(args.outputdir, pickle_name+'_import.time'), 'w') as ofh:
        ofh.write('# import time: {:s}\n'.format(str(import_time)))