Beispiel #1
0
def get_corpora(sspaths):
    """Collect the distinct corpus names referenced by the settings.

    Each setting yielded by ``generate_settings(sspaths)`` is mapped to its
    pickle name via ``utils.get_pickle_name``.  The corpus name is the base
    name of that pickle name with its final extension stripped.

    Returns a set of corpus names.
    """
    pickle_names = (
        utils.get_pickle_name(setting)
        for setting in generate_settings(sspaths)
    )
    # A set comprehension de-duplicates, so no explicit membership test is
    # needed.
    return {
        os.path.splitext(os.path.basename(name))[0] for name in pickle_names
    }
Beispiel #2
0
def get_corpora(sspaths):
    """Return the set of corpus names used by the given settings.

    The corpus name of a setting is the file-name part of
    ``utils.get_pickle_name(setting)`` without its final extension.
    Settings are obtained from ``generate_settings(sspaths)``.
    """
    corpora = set()
    for setting in generate_settings(sspaths):
        pickle_file = os.path.basename(utils.get_pickle_name(setting))
        stem, _extension = os.path.splitext(pickle_file)
        # set.add is a no-op for names already collected.
        corpora.add(stem)
    return corpora
Beispiel #3
0
def pickle_data(hosts, settings, working_dir, outputdir):
    """Start one PickleThread per distinct host and wait for all of them.

    The work handed to the threads is the set of settings that have
    distinct pickle names (per ``utils.get_pickle_name``); when several
    settings share a pickle name only the first one encountered is kept.
    Every thread receives the same work set and the same lock.

    hosts -- iterable of hosts; duplicates are collapsed
    settings -- iterable of settings to consider
    working_dir, outputdir -- passed through unchanged to PickleThread
    """
    seen_names = set()
    pending = set()
    for setting in settings:
        name = utils.get_pickle_name(setting)
        if name in seen_names:
            continue
        seen_names.add(name)
        pending.add(setting)

    shared_lock = threading.Lock()
    workers = [
        PickleThread(host, working_dir, pending, outputdir, shared_lock)
        for host in set(hosts)
    ]
    # Start every worker before joining any, so they all run concurrently.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
Beispiel #4
0
def pickle_data(hosts, sspaths, working_dir, outputdir):
    """Pickle all corpora needed.

    Settings come from ``generate_settings(sspaths)``.  Only the first
    setting seen for each pickle name (``utils.get_pickle_name``) is kept
    as work.  One PickleThread is created per distinct host, each given
    the shared work set and a common lock; all threads are started and
    then joined.
    """
    seen_names = set()
    pending = set()
    for setting in generate_settings(sspaths):
        name = utils.get_pickle_name(setting)
        if name in seen_names:
            continue
        seen_names.add(name)
        pending.add(setting)

    shared_lock = threading.Lock()
    workers = [
        PickleThread(host, working_dir, pending, outputdir, shared_lock)
        for host in set(hosts)
    ]
    # Start every worker before joining any, so they all run concurrently.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
Beispiel #5
0
def pickle_data(hosts, sspaths, working_dir, outputdir):
    """Pickle all corpora needed.

    Builds the work set from ``generate_settings(sspaths)``, keeping one
    setting per distinct ``utils.get_pickle_name`` value (the first one
    encountered), then runs one PickleThread per distinct host over that
    shared work set and blocks until every thread has finished.
    """
    claimed = set()
    jobs = set()
    for candidate in generate_settings(sspaths):
        target = utils.get_pickle_name(candidate)
        if target in claimed:
            continue
        claimed.add(target)
        jobs.add(candidate)

    guard = threading.Lock()
    pool = [
        PickleThread(machine, working_dir, jobs, outputdir, guard)
        for machine in set(hosts)
    ]
    # Launch everything first; joining inside the same loop would serialize
    # the threads.
    for member in pool:
        member.start()
    for member in pool:
        member.join()
Beispiel #6
0
def _run():
    """Pickle the labeled dataset described by a settings file.

    Command-line arguments are the settings file path and the output
    directory.  If no file named ``utils.get_pickle_name(<settings path>)``
    exists in the output directory, the dataset is built with
    ``get_dataset``, combined with its labels and pickled there.  The
    elapsed wall-clock time is always written to
    ``<pickle name>_import.time`` in the output directory, whether or not a
    new pickle was produced.
    """
    cli = argparse.ArgumentParser(description='Pickler of ActiveTM datasets')
    cli.add_argument(
        'settings',
        help='''the path to a file containing settings, as described in \
            README.md in the root ActiveTM directory''')
    cli.add_argument('outputdir', help='directory for output')
    args = cli.parse_args()

    began = time.time()
    settings = utils.parse_settings(args.settings)
    pickle_name = utils.get_pickle_name(args.settings)
    pickle_path = os.path.join(args.outputdir, pickle_name)
    if not os.path.exists(pickle_path):
        # Build the raw dataset first, then read the labels, then combine.
        raw_dataset = get_dataset(settings)
        label_map = labeled.get_labels(settings['labels'])
        with open(pickle_path, 'wb') as pickle_fh:
            pickle.dump(labeled.LabeledDataset(raw_dataset, label_map),
                        pickle_fh)
    elapsed = datetime.timedelta(seconds=time.time() - began)

    timing_path = os.path.join(args.outputdir, pickle_name + '_import.time')
    with open(timing_path, 'w') as timing_fh:
        timing_fh.write('# import time: {:s}\n'.format(str(elapsed)))
Beispiel #7
0
def _run():
    """Create the dataset pickle for a settings file, if it is missing.

    Parses two positional command-line arguments (``settings`` and
    ``outputdir``).  When the pickle named by ``utils.get_pickle_name`` is
    absent from ``outputdir``, the dataset returned by ``get_dataset`` is
    wrapped together with its labels in a ``labeled.LabeledDataset`` and
    dumped to that file.  The time spent is then recorded in
    ``<pickle name>_import.time`` inside ``outputdir``.
    """
    arg_parser = argparse.ArgumentParser(
        description='Pickler of ActiveTM datasets')
    arg_parser.add_argument(
        'settings',
        help='''the path to a file containing settings, as described in \
            README.md in the root ActiveTM directory''')
    arg_parser.add_argument('outputdir', help='directory for output')
    args = arg_parser.parse_args()
    outdir = args.outputdir

    t_start = time.time()
    settings = utils.parse_settings(args.settings)
    pickle_name = utils.get_pickle_name(args.settings)
    destination = os.path.join(outdir, pickle_name)
    already_pickled = os.path.exists(destination)
    if not already_pickled:
        unlabeled = get_dataset(settings)
        labels = labeled.get_labels(settings['labels'])
        combined = labeled.LabeledDataset(unlabeled, labels)
        with open(destination, 'wb') as sink:
            pickle.dump(combined, sink)
    t_end = time.time()

    # The timing file is written even when the pickle already existed.
    duration = datetime.timedelta(seconds=t_end - t_start)
    report = '# import time: {:s}\n'.format(str(duration))
    with open(os.path.join(outdir, pickle_name + '_import.time'), 'w') as sink:
        sink.write(report)
Beispiel #8
0
def _run():
    """Pickle the labeled dataset named by the settings file.

    Reads ``settings`` and ``outputdir`` from the command line.  If the
    pickle file (named via ``utils.get_pickle_name``) does not yet exist in
    ``outputdir``, the dataset from ``get_dataset`` is combined with its
    labels and dumped there.  The elapsed time is always written to
    ``<pickle name>_import.time`` in ``outputdir``.
    """
    cli = argparse.ArgumentParser(description="Pickler of ActiveTM datasets")
    cli.add_argument(
        "settings",
        help="""the path to a file containing settings, as described in \
            README.md in the root ActiveTM directory""",
    )
    cli.add_argument("outputdir", help="directory for output")
    args = cli.parse_args()

    began = time.time()
    settings = utils.parse_settings(args.settings)
    pickle_name = utils.get_pickle_name(args.settings)
    pickle_path = os.path.join(args.outputdir, pickle_name)
    if not os.path.exists(pickle_path):
        # Order matters only for side effects: dataset first, labels second.
        raw_dataset = get_dataset(settings)
        label_map = labeled.get_labels(settings["labels"])
        with open(pickle_path, "wb") as pickle_fh:
            pickle.dump(
                labeled.LabeledDataset(raw_dataset, label_map), pickle_fh
            )
    elapsed = datetime.timedelta(seconds=time.time() - began)

    timing_path = os.path.join(args.outputdir, pickle_name + "_import.time")
    with open(timing_path, "w") as timing_fh:
        timing_fh.write("# import time: {:s}\n".format(str(elapsed)))
Beispiel #9
0
def _run():
    """Run one ActiveTM experiment from the command line.

    Positional command-line arguments:
        settings  -- path to the settings file
        outputdir -- base output directory; it must already contain the
                     dataset pickle named by ``utils.get_pickle_name``
        label     -- identifying label, used as the results file name and
                     passed to ``utils.get_mae_out_name``
        seed      -- optional integer seed; when omitted (-1) the seed is
                     taken from ``settings['seed']``

    The function loads the pickled dataset, partitions the document ids
    into test / labeled / unlabeled groups, trains the model on the labeled
    documents, and then repeatedly selects more documents to label and
    retrains until ``settings['endlabeled']`` documents are labeled or no
    unlabeled documents remain.  After every training round the test
    predictions are scored; the mean absolute errors are saved with
    ``np.savetxt`` and one result row is recorded.

    Results are written to ``<outputdir>/<settings['group']>/<label>``: a
    ``# init time`` header followed by one tab-separated row per round with
    the number of labeled documents, seconds elapsed since training began,
    seconds spent in that round's select-and-train step, and the pR2 score.

    While running, a marker file ``<outputdir>/running/<hostname>.<pid>``
    exists; it is removed when the function exits.
    """
    parser = argparse.ArgumentParser(description='Job runner for ActiveTM '
            'experiments')
    parser.add_argument('settings', help=\
            '''the path to a file containing settings, as described in \
            README.md in the root ActiveTM directory''')
    parser.add_argument('outputdir', help='directory for output')
    parser.add_argument('label', help='identifying label')
    # -1 is the sentinel for "no seed given on the command line".
    parser.add_argument('seed', default=-1, type=int, nargs='?')
    args = parser.parse_args()
    # print('Parsed arguments')

    settings = utils.parse_settings(args.settings)
    # print('Parsed settings')
    # Results for this run go into a per-group subdirectory of outputdir.
    trueoutputdir = os.path.join(args.outputdir, settings['group'])
    if not os.path.exists(trueoutputdir):
        try:
            os.makedirs(trueoutputdir)
        except OSError:
            # NOTE(review): presumably tolerates another job creating the
            # directory concurrently, but this also hides every other
            # OSError (e.g. permissions) -- confirm this is intended.
            pass
    # print('Ensured true output directory exists')
    # Marker file name is unique per host and process.
    filename = socket.gethostname()+'.'+str(os.getpid())
    # NOTE(review): the 'running' directory is not created here; it
    # presumably must exist beforehand -- verify against the job launcher.
    runningfile = os.path.join(args.outputdir, 'running',
            filename)
    try:
        with open(runningfile, 'w') as outputfh:
            outputfh.write('running')
        # print('Created running mark')

        # --- initialization (timed as init_time) ---
        start = time.time()
        input_pickle = os.path.join(args.outputdir, utils.get_pickle_name(args.settings))
        with open(input_pickle, 'rb') as ifh:
            dataset = pickle.load(ifh)
        # print('Got pickle')
        # A seed given on the command line overrides the one in settings.
        if args.seed == -1:
            rng = random.Random(int(settings['seed']))
        else:
            rng = random.Random(args.seed)
        # print('Set random seed: ', args.seed)
        model = models.build(rng, settings)
        # print('Built model')
        test_doc_ids, labeled_doc_ids, unlabeled_doc_ids =\
                partition_data_ids(dataset.num_docs, rng, settings)
        # Labels are looked up by document title; tokens by document id.
        test_labels = []
        test_words = []
        for t in test_doc_ids:
            test_labels.append(dataset.labels[dataset.titles[t]])
            test_words.append(dataset.doc_tokens(t))
        test_labels_mean = np.mean(test_labels)
        # known_labels is kept parallel to labeled_doc_ids.
        known_labels = []
        for t in labeled_doc_ids:
            known_labels.append(dataset.labels[dataset.titles[t]])
        # print('Set up initial sets')

        SELECT_METHOD = select.factory[settings['select']]
        END_LABELED = int(settings['endlabeled'])
        LABEL_INCREMENT = int(settings['increment'])
        CAND_SIZE = int(settings['candsize'])
        results = []
        end = time.time()
        init_time = datetime.timedelta(seconds=end-start)

        # --- initial training round (count == 0) ---
        # 'start' is reset here, so the elapsed time in each result row is
        # measured from the beginning of training, not from program start.
        start = time.time()
        # sandt = select_and_train
        sandt_start = time.time()
        model.train(dataset, labeled_doc_ids, known_labels)
        # print('Trained model')
        sandt_end = time.time()
        # count numbers the rounds and is passed to get_mae_out_name.
        count = 0
        predictions = evaluate.get_predictions(model, test_words)
        pr2 = evaluate.pR2(predictions,
                           test_labels,
                           test_labels_mean)
        maes = evaluate.mean_absolute_errors(predictions, test_labels)
        np.savetxt(utils.get_mae_out_name(trueoutputdir, args.label, count),
                   maes)
        # Row: [number labeled, seconds since start, seconds in
        # select-and-train, pR2].
        results.append([len(labeled_doc_ids),
                datetime.timedelta(seconds=time.time()-start).total_seconds(),
                datetime.timedelta(seconds=sandt_end-sandt_start).total_seconds(),
                pr2])
        # --- active-learning rounds ---
        # Stop once enough documents are labeled or none are left to pick.
        while len(labeled_doc_ids) < END_LABELED and len(unlabeled_doc_ids) > 0:
            count += 1
            sandt_start = time.time()
            # must make unlabeled_doc_ids (which is a set) into a list
            candidates = select.reservoir(list(unlabeled_doc_ids), rng, CAND_SIZE)
            chosen = SELECT_METHOD(dataset, labeled_doc_ids, candidates, model,
                    rng, LABEL_INCREMENT)
            # Move each chosen document from the unlabeled to the labeled
            # pool, keeping known_labels aligned with labeled_doc_ids.
            for c in chosen:
                known_labels.append(dataset.labels[dataset.titles[c]])
                labeled_doc_ids.append(c)
                unlabeled_doc_ids.remove(c)
            # NOTE(review): the extra True argument is not used in the
            # initial train call; its meaning is defined by the model
            # class -- confirm there.
            model.train(dataset, labeled_doc_ids, known_labels, True)
            sandt_end = time.time()
            predictions = evaluate.get_predictions(model, test_words)
            pr2 = evaluate.pR2(predictions, test_labels, test_labels_mean)
            maes = evaluate.mean_absolute_errors(predictions, test_labels)
            np.savetxt(utils.get_mae_out_name(trueoutputdir, args.label, count),
                       maes)
            results.append([len(labeled_doc_ids),
                    datetime.timedelta(seconds=time.time()-start).total_seconds(),
                    datetime.timedelta(seconds=sandt_end-sandt_start).total_seconds(),
                    pr2])
        model.cleanup()

        # --- write results file ---
        output = []
        output.append('# init time: {:s}'.format(str(init_time)))
        for result in results:
            output.append('\t'.join([str(r) for r in result]))
        # The trailing empty entry makes the joined text end with a newline.
        output.append('')
        with open(os.path.join(trueoutputdir, args.label), 'w') as ofh:
            ofh.write('\n'.join(output))
    finally:
        # Always clear the running marker, even if the experiment failed.
        # NOTE(review): if creating the marker itself failed, this remove
        # raises too -- confirm whether that should be tolerated.
        os.remove(runningfile)
Beispiel #10
0
            (ankura.pipeline.filter_stopwords, settings['stopwords']),
            (ankura.pipeline.filter_rarewords, int(settings['rare'])),
            (ankura.pipeline.filter_commonwords, int(settings['common'])),
            (ankura.pipeline.filter_smalldocs, int(settings['smalldoc']))])
    if settings['pregenerate'] == 'YES':
        PIPELINE.append((ankura.pipeline.pregenerate_doc_tokens))
    return ankura.pipeline.run_pipeline(PIPELINE)

if __name__ == '__main__':
    # Script entry point: pickle the labeled dataset described by the given
    # settings file (unless the pickle already exists in the output
    # directory) and record how long that took.
    parser = argparse.ArgumentParser(
        description='Pickler of ActiveTM datasets')
    parser.add_argument(
        'settings',
        help='''the path to a file containing settings, as described in \
            README.md in the root ActiveTM directory''')
    parser.add_argument('outputdir', help='directory for output')
    args = parser.parse_args()

    start = time.time()
    settings = utils.parse_settings(args.settings)
    pickle_name = utils.get_pickle_name(args.settings)
    pickle_path = os.path.join(args.outputdir, pickle_name)
    if not os.path.exists(pickle_path):
        pre_dataset = get_dataset(settings)
        labels = labeled.get_labels(settings['labels'])
        dataset = labeled.LabeledDataset(pre_dataset, labels)
        with open(pickle_path, 'wb') as ofh:
            pickle.dump(dataset, ofh)
    end = time.time()

    # The timing file is written whether or not a new pickle was produced.
    import_time = datetime.timedelta(seconds=end - start)
    time_path = os.path.join(args.outputdir, pickle_name + '_import.time')
    with open(time_path, 'w') as ofh:
        ofh.write('# import time: {:s}\n'.format(str(import_time)))

Beispiel #11
0
    settings = utils.parse_settings(args.settings)
    trueoutputdir = os.path.join(args.outputdir, settings['group'])
    if not os.path.exists(trueoutputdir):
        try:
            os.makedirs(trueoutputdir)
        except OSError:
            pass
    filename = socket.gethostname()+'.'+str(os.getpid())
    runningfile = os.path.join(args.outputdir, 'running',
            filename)
    try:
        with open(runningfile, 'w') as outputfh:
            outputfh.write('running')

        start = time.time()
        input_pickle = os.path.join(args.outputdir, utils.get_pickle_name(args.settings))
        with open(input_pickle, 'rb') as ifh:
            dataset = pickle.load(ifh)
        rng = random.Random(int(settings['seed']))
        model = models.build(rng, settings)
        test_doc_ids, labeled_doc_ids, unlabeled_doc_ids =\
                partition_data_ids(dataset.num_docs, rng, settings)
        test_labels = []
        test_words = []
        for t in test_doc_ids:
            test_labels.append(dataset.labels[dataset.titles[t]])
            test_words.append(dataset.doc_tokens(t))
        test_labels_mean = np.mean(test_labels)
        known_labels = []
        for t in labeled_doc_ids:
            known_labels.append(dataset.labels[dataset.titles[t]])