def main(task_id, ensemble_dir, performance_range_threshold, ensemble_size,
         max_keep_best, seed, only_portfolio_runs, call_from_cmd):

    if max_keep_best > 1:
        assert max_keep_best == int(max_keep_best)
        max_keep_best = int(max_keep_best)

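    # Fixed settings for the ensemble builder: memory limit, float precision
    # and the evaluation metric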
    memory_limit = 4000
    precision = 32
    metric = make_scorer('balanced_accuracy_fast', BalancedAccuracy())

    if not os.path.exists(ensemble_dir):
        raise NotADirectoryError("%s does not exist" % ensemble_dir)
    if call_from_cmd:
        assert str(task_id) in ensemble_dir

    fl_name = "ensemble_results_%fthresh_%dsize_%fbest" % \
              (performance_range_threshold, ensemble_size, max_keep_best)
    if only_portfolio_runs:
        fl_name += "_only_portfolio"
    fl_name = os.path.join(ensemble_dir, fl_name)
    if os.path.isfile(fl_name):
        raise ValueError("Nothing left to do, %s already exists" % fl_name)

    # collect the ensemble prediction files in the output directory and count them
    if call_from_cmd:
        pred_dir = os.path.join(ensemble_dir, "auto-sklearn-output",
                                ".auto-sklearn", "predictions_ensemble")
        n_models = glob.glob(pred_dir +
                             "/predictions_ensemble_%d_*.npy.gz" % seed)
    else:
        pred_dir = os.path.join(ensemble_dir, ".auto-sklearn",
                                "predictions_ensemble")
        n_models = glob.glob(pred_dir +
                             "/predictions_ensemble_%d_*.npy" % seed)
    n_models.sort(key=lambda x: int(float(x.split("_")[-2])))
    print("\n".join(n_models))
    print("Found %d ensemble predictions" % len(n_models))
    if len(n_models) == 0:
        raise ValueError("%s has no ensemble predictions" % pred_dir)

    # Get start time of ensemble building: 1) load json 2) find key 3) get creation times
    if call_from_cmd:
        timestamps_fl = os.path.join(ensemble_dir, "auto-sklearn-output",
                                     "timestamps.json")
    else:
        timestamps_fl = os.path.join(ensemble_dir, "timestamps.json")
    with open(timestamps_fl, "r") as fh:
        timestamps = json.load(fh)
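    # Locate the per-model prediction timestamps and the overall start time for this seed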
    model_timestamps = None
    overall_start_time = None
    for k in timestamps:
        if "predictions_ensemble" in k:
            model_timestamps = timestamps[k]
        if "start_time_%d" % seed in timestamps[k]:
            overall_start_time = timestamps[k]["start_time_%d" % seed]
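    # Remove lock files and any entries that are not ensemble predictions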
    timestamp_keys = list(model_timestamps.keys())
    for timestamp_key in timestamp_keys:
        if timestamp_key.endswith(
                'lock') or 'predictions_ensemble' not in timestamp_key:
            del model_timestamps[timestamp_key]
    assert model_timestamps is not None and overall_start_time is not None
    assert len(model_timestamps) == len(n_models), (len(model_timestamps),
                                                    len(n_models))
    # Get the overall time limit
    vanilla_results_fl = os.path.join(ensemble_dir, "result.json")
    with open(vanilla_results_fl, "r") as fh:
        vanilla_results = json.load(fh)

    # If only portfolio configurations, read runhistory
    if only_portfolio_runs:
        if call_from_cmd:
            runhistory_fl = os.path.join(ensemble_dir, "auto-sklearn-output",
                                         "smac3-output", "run*",
                                         "runhistory.json")
        else:
            runhistory_fl = os.path.join(ensemble_dir, "smac3-output", "run*",
                                         "runhistory.json")
        runhistory_fl = glob.glob(runhistory_fl)
        assert len(runhistory_fl) == 1
        with open(runhistory_fl[0], "r") as fh:
            runhistory = json.load(fh)

        init_design_num_runs = []
        for i in runhistory["data"]:
            if i[1][3]["configuration_origin"] == "Initial design":
                if "error" in i[1][3]:
                    continue
                init_design_num_runs.append(i[1][3]["num_run"])
        print("Portfolio stopped after %s runs" % str(init_design_num_runs))
        last_run = max(init_design_num_runs)
        print("Cut down to only portfolio runs fom %d" % len(n_models))
        for i, n in enumerate(n_models):
            if int(float(n.split("_")[-2])) > last_run:
                n_models = n_models[:i]
                break
        print("... to %d" % len(n_models))

    # load data
    X_train, y_train, X_test, y_test, cat = load_task(task_id)

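    # Infer the task type from the number of classes in the test labels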
    if len(np.unique(y_test)) == 2:
        task_type = BINARY_CLASSIFICATION
    elif len(np.unique(y_test)) > 2:
        task_type = MULTICLASS_CLASSIFICATION
    else:
        raise ValueError("Unknown task type for task %d" % task_id)

    tmp_dir = tempfile.TemporaryDirectory()
    loss_trajectory = []

    # Construct ensemble builder
    context = BackendContextMock(
        temporary_directory=(ensemble_dir + "/auto-sklearn-output/"
                             if call_from_cmd else ensemble_dir),
        output_directory=tmp_dir.name,
        delete_tmp_folder_after_terminate=False,
        delete_output_folder_after_terminate=False,
        shared_mode=False)
    backend = Backend(context)

    ens_builder = EnsembleBuilder(
        backend=backend,
        dataset_name=str(task_id),
        task_type=task_type,
        metric=metric,
        limit=np.inf,
        ensemble_size=ensemble_size,
        ensemble_nbest=max_keep_best,
        performance_range_threshold=performance_range_threshold,
        max_models_on_disc=None,
        seed=seed,
        shared_mode=False,
        precision=precision,
        max_iterations=1,
        read_at_most=1,
        memory_limit=memory_limit,
        random_state=1,
        sleep_duration=0)

    try:
        # iterate over all models, take construction time into account when creating new trajectory
        current_ensemble_timestamp = 0
        skipped = 1
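        # `skipped` is one plus the number of prediction files that appeared
        # while the previous ensemble iteration was still running; the builder
        # reads that many files at once via `read_at_most`.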
        for midx, model_path in enumerate(n_models):
            tstamp = model_timestamps[model_path.split("/")[-1].replace(
                '.gz', '')] - overall_start_time
            if current_ensemble_timestamp > tstamp:
                # this model finished while the previous ensemble iteration
                # was still running; skip it and read it with the next batch
                skipped += 1
                continue

            # Do one ensemble building step
            start = time.time()
            ens_builder.random_state = check_random_state(1)
            print("############## %d: Working on %s (skipped %d)" %
                  (midx + 1, model_path, skipped - 1))
            logging.basicConfig(level=logging.DEBUG)
            ens_builder.read_at_most = skipped
            valid_pred, test_pred = ens_builder.main(return_pred=True)
            last_dur = time.time() - start
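            # Advance the simulated clock: the updated ensemble is only
            # available after this model's timestamp plus the time the
            # building step itself took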
            current_ensemble_timestamp = tstamp + last_dur

            if current_ensemble_timestamp >= vanilla_results["0"]["time_limit"]:
                print("############## Went over time %f > %f; Stop here" %
                      (current_ensemble_timestamp,
                       vanilla_results["0"]["time_limit"]))
                break

            # Reset, since we have just read model files
            skipped = 1
            if test_pred is None:
                # Adding this model did not change the ensemble, no new prediction
                continue
            if task_type == BINARY_CLASSIFICATION:
                # Recreate nx2 array
                test_pred = np.concatenate(
                    [1 - test_pred.reshape([-1, 1]),
                     test_pred.reshape([-1, 1])],
                    axis=1)

            # Build trajectory entry
            loss = 1 - balanced_accuracy(y_true=y_test, y_pred=test_pred)
            loss_trajectory.append((current_ensemble_timestamp, loss))
            print("############## Round %d took %g sec" %
                  (midx, time.time() - start))
    finally:
        tmp_dir.cleanup()

    # Store results
    result = dict()
    result[ensemble_size] = {
        'task_id': task_id,
        'time_limit': vanilla_results["0"]["time_limit"],
        'loss': loss_trajectory[-1][1],
        'configuration': {
            "n_models": n_models,
            "performance_range_threshold": performance_range_threshold,
            "ensemble_size": ensemble_size,
            "max_keep_best": max_keep_best,
            "seed": seed,
            "memory_limit": memory_limit,
            "precision": precision,
        },
        'n_models': len(n_models),
        'trajectory': loss_trajectory,
    }

    with open(fl_name, 'wt') as fh:
        json.dump(result, fh, indent=4)
    print("Dumped to %s" % fl_name)
Example #2
import random
import torch
import torch.nn as nn
from tqdm import tqdm
from utils import load_task, make_word_vector, to_var, load_glove_weights, frobenius
from utils import save_pickle, load_pickle
from models import SelfAttentiveNet

data = load_task('./dataset/review.json')
# data = load_pickle('data.pickle')
# save_pickle(data, 'data.pickle')

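# Build the vocabulary from all reviews; index 0 is reserved for the <PAD> token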
vocab = set()
for review, _ in data:
    vocab |= set(review)

vocab = ['<PAD>'] + list(sorted(vocab))
w2i = dict((w, i) for i, w in enumerate(vocab, 0))
i2w = dict((i, w) for i, w in enumerate(vocab, 0))
print('vocab size', len(vocab))

n_dev = 2000
split_id = len(data) - n_dev
train_data = data[:split_id]
dev_data = data[split_id:]

n_epoch = 4
batch_size = 32
embd_size = 100
attn_hops = 60
Example #3
	taskFilename = sys.argv[1]
	vectorsFilename = sys.argv[2]
	pathToSVMFile = sys.argv[3]
	clusterFile = sys.argv[4]
	relFile = sys.argv[5]
	pathToExpansionCache = sys.argv[6]
	normalVectorsFile = sys.argv[7]

	expansion = 5
	window = 5
	svmFileInfo = '_SVM_' + clusterFile.split('/')[-1] + "_expansionParam" + str(expansion) + "_window" + str(window)
	expansionCacheInfo = "_expansionParam_"  + str(expansion)
	
	print "Loading rel, task, vector, words that have been disambiguated"
	rel = shelve.open(relFile)
	task, tralala = load_task(taskFilename)
	vectors = load_vectors(vectorsFilename)
	normalVectors = load_vectors(normalVectorsFile)
	disambiguatedWords = [x.split("_")[0] for x in os.listdir(pathToSVMFile)]

	print "Reading agglomerative cluster centers"
	clusterCenters = [getAverageWordRep(x, vectors) for x in read_sets(clusterFile)]

	print "Starting..."
	# initiate empty ratings
	methodsRating = []
	humanRating = []
	questions = task.values()

	jointVocCache = dict()
	partVoc = set(vectors.keys())
Example #4
    vectorsFilename = sys.argv[2]
    pathToSVMFile = sys.argv[3]
    clusterFile = sys.argv[4]
    relFile = sys.argv[5]
    pathToExpansionCache = sys.argv[6]
    normalVectorsFile = sys.argv[7]

    expansion = 5
    window = 5
    svmFileInfo = '_SVM_' + clusterFile.split(
        '/')[-1] + "_expansionParam" + str(expansion) + "_window" + str(window)
    expansionCacheInfo = "_expansionParam_" + str(expansion)

    print "Loading rel, task, vector, words that have been disambiguated"
    rel = shelve.open(relFile)
    task, tralala = load_task(taskFilename)
    vectors = load_vectors(vectorsFilename)
    normalVectors = load_vectors(normalVectorsFile)
    disambiguatedWords = [x.split("_")[0] for x in os.listdir(pathToSVMFile)]

    print "Reading agglomerative cluster centers"
    clusterCenters = [
        getAverageWordRep(x, vectors) for x in read_sets(clusterFile)
    ]

    print "Starting..."
    # initiate empty ratings
    methodsRating = []
    humanRating = []
    questions = task.values()
Example #5
	print "Loading stuf..."
	taskFilename = sys.argv[1]
	filename = sys.argv[2] # "../../../cluster_descriptors/enwiki8.clust-desc.shelve"
	vectorsFilename = sys.argv[3]
	vectors = load_vectors(vectorsFilename)

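	# Copy the cluster-descriptor shelve into an in-memory dict and collect
	# each cluster's set of keys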
	d = shelve.open(filename)
	key_sets = []
	newD = dict()
	vec_size = len(d.keys())

	for i in xrange(vec_size):
		key_sets.append(set(d[str(i)].keys()))
		newD[i] = d[str(i)]

	task, _ = load_task(taskFilename)
	questions = task.values()

	methodsRating = []
	humanRating = []

	print "Answering", len(task), "questions..."

	for i in xrange(len(questions)):
		if i % 100 == 0 and not i == 0:
			print "\tIteration", i, ": ", spearman(methodsRating, humanRating)

		question = questions[i]

		word1 = Word(question['word1']).lemma()
		word2 = Word(question['word2']).lemma()
Example #6
import sys
from utils import load_task, Word
from fast_utils import cosine_similarity, load_vectors, spearman

if __name__ == "__main__":
	print "Baseline with wordvectors"
	if len(sys.argv) < 3:
		print "USAGE: python baselline_word2vec.py <PATH TO TASK> <PATH TO WORDVECTORS>"
		sys.exit()
	taskFilename = sys.argv[1]
	vectorsFilename = sys.argv[2]

	task, _ = load_task(taskFilename)
	vectors = load_vectors(vectorsFilename)

	methodsRating = []
	humanRating = []

	questions = task.values()
	coverage = 0

	for i in xrange(len(questions)):
		question = questions[i]

		word1 = Word(question['word1']).lemma()
		word2 = Word(question['word2']).lemma()

		if word1 in vectors and word2 in vectors:
			vec1 = vectors[word1]
			vec2 = vectors[word2]
			methodsRating.append(cosine_similarity(vec1, vec2))
Example #7
create_symlink = args.create_symlink
only_check_stats_file = args.only_check_stats_file
max_runtime_limit = args.max_runtime_limit
disable_fallback = args.disable_fallback

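# Load the pickled selector together with the strategies it can choose from and their metadata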
with open(selector_file, 'rb') as fh:
    selector_dict = pickle.load(fh)

selector = selector_dict['selector']
methods_to_choose_from = selector_dict['methods_to_choose_from']
methods_information = selector_dict['methods_information']
if disable_fallback:
    if hasattr(selector, 'default_strategy_idx'):
        selector.default_strategy_idx = None

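# Load the training data only to compute meta-features; it is freed again right afterwards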
X_train, y_train, _, _, _ = load_task(task_id)
min_num_samples_per_class = np.min(np.unique(y_train, return_counts=True)[1])

meta_features = compute_meta_features(X_train, y_train)
del X_train
del y_train
meta_features = np.array([
    meta_features['NumberOfClasses'], meta_features['NumberOfFeatures'],
    meta_features['NumberOfInstances']
])

#meta_features = get_meta_features(
#        task_id, '/home/eggenspk/PoSHAutosklearn/2020_IEEE_Autosklearn_experiments/experiment_scripts/60MIN/AutoAuto_build_more_metafeatures/metafeatures/')
#meta_features = pd.Series(meta_features)
#for name in ['NumberOfClasses',
#     'NumberOfFeatures',
Example #8
EVALUATION_INTERVAL = 1
GLOVE_PATH = '/home/crli/crli/glove.840B.300d.txt'
DROPOUT = 0.3
LAYER_NUM = 3
FIXED_EMBEDDING_NUM = 1000
SPAN_LENGTH = 15
KERNEL_SIZES = [2, 3]


TRAIN_PATH = './squad/train-v1.1.json'
DEV_PATH = './squad/dev-v1.1.json'
path_to_predictions = './predict/predict_result'
path_to_dev = DEV_PATH

if options.first_load_data:
    train_exs, dev_exs = utils.load_task(TRAIN_PATH, DEV_PATH)
    with open(options.train_data, 'wb') as file:
        pickle.dump(train_exs, file)
    with open(options.dev_data, 'wb') as file:
        pickle.dump(dev_exs, file)
else:
    with open(options.train_data, 'rb') as file:
        train_exs = pickle.load(file)
    with open(options.dev_data, 'rb') as file:
        dev_exs = pickle.load(file)

print('The size of train_set:',len(train_exs))
print('The size of dev_set:',len(dev_exs))
TRAIN_SIZE = len(train_exs)
DEV_SIZE = len(dev_exs)
Example #9
    # open the vectors
    print "Loading vectors"
    vecs = load_vectors(vecFile)

    # read clusters and get their cluster centers by taking the average...
    print "Reading agglomerative cluster centers"
    agglomerativeClusterCenters = [
        getAverageWordRep(x, vecs) for x in read_sets(clusterFile)
    ]

    # set some parameters
    expansion = 5
    window = 5

    # get the words that occur in the task and need to be compared
    _, wordsToSplit = load_task(pathToTask)

    indexCache = dict()

    wordsToSplit = filter(lambda x: x not in alreadyDisambiguatedWords,
                          wordsToSplit)
    total = len(wordsToSplit)

    for i, word in enumerate(wordsToSplit):
        # progress
        print "Working on word ", word, i, " / ", total
        mySVM, availableSVM, expansionCache = getSVM(
            word,
            read_file(textfile),
            rel,
            vecs,
Example #10
	rel = shelve.open(relFile)
	
	# open the vectors
	print "Loading vectors"
	vecs = load_vectors(vecFile)
	
	# read clusters and get their cluster centers by taking the average...
	print "Reading agglomerative cluster centers"
	agglomerativeClusterCenters = [getAverageWordRep(x, vecs) for x in read_sets(clusterFile)]
	
	# set some parameters
	expansion = 5
	window = 5
	
	# get the words that occur in the task and need to be compared
	_, wordsToSplit = load_task(pathToTask)
	
	indexCache = dict()

	wordsToSplit = filter(lambda x: x not in alreadyDisambiguatedWords, wordsToSplit)
	total = len(wordsToSplit)

	for i, word in enumerate(wordsToSplit):
		# progress
		print "Working on word ", word, i, " / ", total
		mySVM, availableSVM, expansionCache = getSVM(word, read_file(textfile), rel, vecs, agglomerativeClusterCenters, indexCache, expansionParam=expansion, skipsize=window)
		# if we found an svm
		if availableSVM:
			# dump the svm
			pickle.dump(mySVM, open(pathToSVMFile + word + '_SVM_' + clusterFile.split('/')[-1] + "_expansionParam" + str(expansion) + "_window" + str(window), 'w'))
			# open expansioncache shelve object
Example #11
if os.path.exists(tmp_dir):
    print('Output directory %s already exists - no need to run this again!' % tmp_dir)
    exit(0)
else:
    os.makedirs(tmp_dir)
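    # Store software information and the parsed command-line arguments alongside the results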
    software_stats_file = os.path.join(tmp_dir, 'software.txt')
    with open(software_stats_file, 'wt') as fh:
        fh.write(strio.read())
    argparser_content = {}
    for key, value in vars(args).items():
        argparser_content[key] = value
    argparser_content_file = os.path.join(tmp_dir, 'arguments.json')
    with open(argparser_content_file, 'wt') as fh:
        json.dump(argparser_content, fh, indent=4)

X_train, y_train, X_test, y_test, cat = load_task(task_id)

iterative_wo_early_stopping = ['extra_trees', 'PassiveAggressiveWOEarlyStopping', 'random_forest',
                               'SGDWOEarlyStopping', 'GradientBoostingClassifierWOEarlyStopping']
iterative_w_early_stopping = ['extra_trees', 'passive_aggressive', 'random_forest', 'sgd', 'gradient_boosting']

if not early_stopping:
    add_classifier_wo_early_stopping()

if searchspace == "iterative":
    include_estimator = iterative_w_early_stopping if early_stopping else iterative_wo_early_stopping
    include_preprocessor = ["no_preprocessing", ]
elif searchspace == "iterative-preproc":
    include_estimator = iterative_w_early_stopping if early_stopping else iterative_wo_early_stopping
    include_preprocessor = None
elif searchspace == "full":