# Consolidated imports for the functions in this section. The
# `from activetm import ...` path is an assumption about the repository
# layout for the project-local modules referenced below; generate_settings,
# PickleThread, get_dataset, and partition_data_ids are defined elsewhere
# in the repo.
import argparse
import datetime
import os
import pickle
import random
import socket
import threading
import time

import numpy as np

import ankura.pipeline
from activetm import evaluate, labeled, models, select, utils


def get_corpora(sspaths):
    """Find corpus names"""
    result = set()
    for setting in generate_settings(sspaths):
        corpus = os.path.splitext(
            os.path.basename(utils.get_pickle_name(setting)))[0]
        # set.add is idempotent, so no membership check is needed
        result.add(corpus)
    return result
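# A minimal usage sketch for get_corpora, assuming each entry in `sspaths`
# points at a settings file that generate_settings can expand; the paths
# themselves are hypothetical.
#
#     corpora = get_corpora(['settings/amazon.settings',
#                            'settings/yelp.settings'])
#     for corpus in sorted(corpora):
#         print(corpus)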
def pickle_data(hosts, sspaths, working_dir, outputdir):
    """Pickle all corpora needed"""
    picklings = set()
    work = set()
    for setting in generate_settings(sspaths):
        pickle_name = utils.get_pickle_name(setting)
        if pickle_name not in picklings:
            picklings.add(pickle_name)
            work.add(setting)
    lock = threading.Lock()
    threads = []
    for host in set(hosts):
        thread = PickleThread(host, working_dir, work, outputdir, lock)
        threads.append(thread)
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
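# PickleThread is referenced above but defined elsewhere in the repo. The
# class below is only a sketch of the worker pattern pickle_data appears to
# rely on (each thread drains the shared `work` set under `lock`), not the
# repo's implementation; the ssh command line and the pickle_data.py script
# name are hypothetical.
import subprocess


class PickleThreadSketch(threading.Thread):
    def __init__(self, host, working_dir, work, outputdir, lock):
        threading.Thread.__init__(self)
        self.host = host
        self.working_dir = working_dir
        self.work = work
        self.outputdir = outputdir
        self.lock = lock

    def run(self):
        while True:
            # Pop one settings path at a time under the shared lock
            with self.lock:
                if not self.work:
                    return
                setting = self.work.pop()
            # Hypothetical remote invocation of the pickler on this host
            subprocess.check_call(
                ['ssh', self.host,
                 'cd {:s} && python pickle_data.py {:s} {:s}'.format(
                     self.working_dir, setting, self.outputdir)])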
def _run():
    parser = argparse.ArgumentParser(description="Pickler of ActiveTM datasets")
    parser.add_argument(
        "settings",
        help="the path to a file containing settings, as described in "
        "README.md in the root ActiveTM directory",
    )
    parser.add_argument("outputdir", help="directory for output")
    args = parser.parse_args()
    start = time.time()
    settings = utils.parse_settings(args.settings)
    pickle_name = utils.get_pickle_name(args.settings)
    if not os.path.exists(os.path.join(args.outputdir, pickle_name)):
        pre_dataset = get_dataset(settings)
        labels = labeled.get_labels(settings["labels"])
        dataset = labeled.LabeledDataset(pre_dataset, labels)
        with open(os.path.join(args.outputdir, pickle_name), "wb") as ofh:
            pickle.dump(dataset, ofh)
        end = time.time()
        import_time = datetime.timedelta(seconds=end - start)
        with open(
            os.path.join(args.outputdir, pickle_name + "_import.time"), "w"
        ) as ofh:
            ofh.write("# import time: {:s}\n".format(str(import_time)))
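# Typical invocation of the pickler, assuming this file is the pickling
# script (the script and settings-file names are hypothetical):
#
#     python pickle_data.py settings/amazon.settings /path/to/output
#
# The settings file must at least supply the keys read in this section:
# 'labels', 'stopwords', 'rare', 'common', 'smalldoc', and 'pregenerate'
# for the pickler, plus 'group', 'seed', 'select', 'endlabeled',
# 'increment', and 'candsize' for the experiment runner. Its exact format
# is defined by utils.parse_settings, as described in README.md.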
def _run(): """Run experiment""" parser = argparse.ArgumentParser(description='Job runner for ActiveTM ' 'experiments') parser.add_argument('settings', help=\ '''the path to a file containing settings, as described in \ README.md in the root ActiveTM directory''') parser.add_argument('outputdir', help='directory for output') parser.add_argument('label', help='identifying label') parser.add_argument('seed', default=-1, type=int, nargs='?') args = parser.parse_args() # print('Parsed arguments') settings = utils.parse_settings(args.settings) # print('Parsed settings') trueoutputdir = os.path.join(args.outputdir, settings['group']) if not os.path.exists(trueoutputdir): try: os.makedirs(trueoutputdir) except OSError: pass # print('Ensured true output directory exists') filename = socket.gethostname()+'.'+str(os.getpid()) runningfile = os.path.join(args.outputdir, 'running', filename) try: with open(runningfile, 'w') as outputfh: outputfh.write('running') # print('Created running mark') start = time.time() input_pickle = os.path.join(args.outputdir, utils.get_pickle_name(args.settings)) with open(input_pickle, 'rb') as ifh: dataset = pickle.load(ifh) # print('Got pickle') if args.seed == -1: rng = random.Random(int(settings['seed'])) else: rng = random.Random(args.seed) # print('Set random seed: ', args.seed) model = models.build(rng, settings) # print('Built model') test_doc_ids, labeled_doc_ids, unlabeled_doc_ids =\ partition_data_ids(dataset.num_docs, rng, settings) test_labels = [] test_words = [] for t in test_doc_ids: test_labels.append(dataset.labels[dataset.titles[t]]) test_words.append(dataset.doc_tokens(t)) test_labels_mean = np.mean(test_labels) known_labels = [] for t in labeled_doc_ids: known_labels.append(dataset.labels[dataset.titles[t]]) # print('Set up initial sets') SELECT_METHOD = select.factory[settings['select']] END_LABELED = int(settings['endlabeled']) LABEL_INCREMENT = int(settings['increment']) CAND_SIZE = int(settings['candsize']) results = [] end = time.time() init_time = datetime.timedelta(seconds=end-start) start = time.time() # sandt = select_and_train sandt_start = time.time() model.train(dataset, labeled_doc_ids, known_labels) # print('Trained model') sandt_end = time.time() count = 0 predictions = evaluate.get_predictions(model, test_words) pr2 = evaluate.pR2(predictions, test_labels, test_labels_mean) maes = evaluate.mean_absolute_errors(predictions, test_labels) np.savetxt(utils.get_mae_out_name(trueoutputdir, args.label, count), maes) results.append([len(labeled_doc_ids), datetime.timedelta(seconds=time.time()-start).total_seconds(), datetime.timedelta(seconds=sandt_end-sandt_start).total_seconds(), pr2]) while len(labeled_doc_ids) < END_LABELED and len(unlabeled_doc_ids) > 0: count += 1 sandt_start = time.time() # must make unlabeled_doc_ids (which is a set) into a list candidates = select.reservoir(list(unlabeled_doc_ids), rng, CAND_SIZE) chosen = SELECT_METHOD(dataset, labeled_doc_ids, candidates, model, rng, LABEL_INCREMENT) for c in chosen: known_labels.append(dataset.labels[dataset.titles[c]]) labeled_doc_ids.append(c) unlabeled_doc_ids.remove(c) model.train(dataset, labeled_doc_ids, known_labels, True) sandt_end = time.time() predictions = evaluate.get_predictions(model, test_words) pr2 = evaluate.pR2(predictions, test_labels, test_labels_mean) maes = evaluate.mean_absolute_errors(predictions, test_labels) np.savetxt(utils.get_mae_out_name(trueoutputdir, args.label, count), maes) results.append([len(labeled_doc_ids), 
datetime.timedelta(seconds=time.time()-start).total_seconds(), datetime.timedelta(seconds=sandt_end-sandt_start).total_seconds(), pr2]) model.cleanup() output = [] output.append('# init time: {:s}'.format(str(init_time))) for result in results: output.append('\t'.join([str(r) for r in result])) output.append('') with open(os.path.join(trueoutputdir, args.label), 'w') as ofh: ofh.write('\n'.join(output)) finally: os.remove(runningfile)
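# partition_data_ids is called above but not defined in this section. The
# function below is a sketch under stated assumptions, not the repo's code:
# the settings keys 'testsize' and 'startlabeled' are hypothetical names for
# the test-set size and the initial number of labeled documents.
def partition_data_ids_sketch(num_docs, rng, settings):
    """Shuffle document ids, then split into test/labeled/unlabeled."""
    doc_ids = list(range(num_docs))
    rng.shuffle(doc_ids)
    test_size = int(settings['testsize'])
    start_labeled = int(settings['startlabeled'])
    test_doc_ids = doc_ids[:test_size]
    # The experiment loop appends to labeled_doc_ids (a list) and removes
    # from unlabeled_doc_ids (a set), so the return types matter here.
    labeled_doc_ids = doc_ids[test_size:test_size + start_labeled]
    unlabeled_doc_ids = set(doc_ids[test_size + start_labeled:])
    return test_doc_ids, labeled_doc_ids, unlabeled_doc_ids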
        (ankura.pipeline.filter_stopwords, settings['stopwords']),
        (ankura.pipeline.filter_rarewords, int(settings['rare'])),
        (ankura.pipeline.filter_commonwords, int(settings['common'])),
        (ankura.pipeline.filter_smalldocs, int(settings['smalldoc']))])
    if settings['pregenerate'] == 'YES':
        # trailing comma keeps this entry a tuple, matching the
        # (function, args...) shape of the other pipeline entries
        PIPELINE.append((ankura.pipeline.pregenerate_doc_tokens,))
    return ankura.pipeline.run_pipeline(PIPELINE)


if __name__ == '__main__':
    _run()
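# evaluate.pR2 and evaluate.mean_absolute_errors are used in the experiment
# loop but defined elsewhere. The sketches below assume pR2 is the usual
# predictive R^2 computed against the test-label mean and that
# mean_absolute_errors returns per-document absolute errors; both are
# assumptions, not the repo's implementations.
def pr2_sketch(predictions, test_labels, test_labels_mean):
    """1 - SSE/SST, where SST is taken against the test-label mean."""
    sse = sum((p - t) ** 2 for p, t in zip(predictions, test_labels))
    sst = sum((t - test_labels_mean) ** 2 for t in test_labels)
    return 1.0 - (sse / sst)


def mean_absolute_errors_sketch(predictions, test_labels):
    """Per-document absolute errors, suitable for np.savetxt."""
    return [abs(p - t) for p, t in zip(predictions, test_labels)]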