def main(task_id, ensemble_dir, performance_range_threshold, ensemble_size,
         max_keep_best, seed, only_portfolio_runs, call_from_cmd):
    if max_keep_best > 1:
        assert max_keep_best == int(max_keep_best)
        max_keep_best = int(max_keep_best)

    memory_limit = 4000
    precision = 32
    metric = make_scorer('balanced_accuracy_fast', BalancedAccuracy())

    if not os.path.exists(ensemble_dir):
        raise NotADirectoryError("%s does not exist" % ensemble_dir)
    if call_from_cmd:
        assert str(task_id) in ensemble_dir

    fl_name = "ensemble_results_%fthresh_%dsize_%fbest" % \
        (performance_range_threshold, ensemble_size, max_keep_best)
    if only_portfolio_runs:
        fl_name += "_only_portfolio"
    fl_name = os.path.join(ensemble_dir, fl_name)
    if os.path.isfile(fl_name):
        raise ValueError("Nothing left to do, %s already exists" % fl_name)

    # Figure out which prediction files are in the directory
    # (despite its name, n_models holds the list of prediction files)
    if call_from_cmd:
        pred_dir = os.path.join(ensemble_dir, "auto-sklearn-output",
                                ".auto-sklearn", "predictions_ensemble")
        n_models = glob.glob(
            pred_dir + "/predictions_ensemble_%d_*.npy.gz" % seed)
    else:
        pred_dir = os.path.join(ensemble_dir, ".auto-sklearn",
                                "predictions_ensemble")
        n_models = glob.glob(
            pred_dir + "/predictions_ensemble_%d_*.npy" % seed)
    n_models.sort(key=lambda x: int(float(x.split("_")[-2])))
    print("\n".join(n_models))
    print("Found %d ensemble predictions" % len(n_models))
    if len(n_models) == 0:
        raise ValueError("%s has no ensemble predictions" % pred_dir)

    # Get the start time of ensemble building:
    # 1) load the json, 2) find the key, 3) read the creation times
    if call_from_cmd:
        timestamps_fl = os.path.join(ensemble_dir, "auto-sklearn-output",
                                     "timestamps.json")
    else:
        timestamps_fl = os.path.join(ensemble_dir, "timestamps.json")
    with open(timestamps_fl, "r") as fh:
        timestamps = json.load(fh)
    model_timestamps = None
    overall_start_time = None
    for k in timestamps:
        if "predictions_ensemble" in k:
            model_timestamps = timestamps[k]
        if "start_time_%d" % seed in timestamps[k]:
            overall_start_time = timestamps[k]["start_time_%d" % seed]
    assert model_timestamps is not None and overall_start_time is not None
    for timestamp_key in list(model_timestamps.keys()):
        if timestamp_key.endswith('lock') \
                or 'predictions_ensemble' not in timestamp_key:
            del model_timestamps[timestamp_key]
    assert len(model_timestamps) == len(n_models), (len(model_timestamps),
                                                    len(n_models))

    # Get the overall time limit
    vanilla_results_fl = os.path.join(ensemble_dir, "result.json")
    with open(vanilla_results_fl, "r") as fh:
        vanilla_results = json.load(fh)

    # If restricted to portfolio configurations, read the runhistory
    if only_portfolio_runs:
        if call_from_cmd:
            runhistory_fl = os.path.join(ensemble_dir, "auto-sklearn-output",
                                         "smac3-output", "run*",
                                         "runhistory.json")
        else:
            runhistory_fl = os.path.join(ensemble_dir, "smac3-output",
                                         "run*", "runhistory.json")
        runhistory_fl = glob.glob(runhistory_fl)
        assert len(runhistory_fl) == 1
        with open(runhistory_fl[0], "r") as fh:
            runhistory = json.load(fh)

        init_design_num_runs = []
        for i in runhistory["data"]:
            if i[1][3]["configuration_origin"] == "Initial design":
                if "error" in i[1][3]:
                    continue
                init_design_num_runs.append(i[1][3]["num_run"])
        print("Portfolio stopped after %s runs" % str(init_design_num_runs))
        last_run = max(init_design_num_runs)
        print("Cut down to only portfolio runs from %d" % len(n_models))
        for i, n in enumerate(n_models):
            if int(float(n.split("_")[-2])) > last_run:
                n_models = n_models[:i]
                break
        print("... to %d" % len(n_models))

    # Load the data
    X_train, y_train, X_test, y_test, cat = load_task(task_id)
    if len(np.unique(y_test)) == 2:
        task_type = BINARY_CLASSIFICATION
    elif len(np.unique(y_test)) > 2:
        task_type = MULTICLASS_CLASSIFICATION
    else:
        raise ValueError("Unknown task type for task %d" % task_id)

    tmp_dir = tempfile.TemporaryDirectory()
    loss_trajectory = []

    # Construct the ensemble builder
    context = BackendContextMock(
        temporary_directory=(ensemble_dir + "/auto-sklearn-output/"
                             if call_from_cmd else ensemble_dir),
        output_directory=tmp_dir.name,
        delete_tmp_folder_after_terminate=False,
        delete_output_folder_after_terminate=False,
        shared_mode=False)
    backend = Backend(context)
    ens_builder = EnsembleBuilder(
        backend=backend,
        dataset_name=str(task_id),
        task_type=task_type,
        metric=metric,
        limit=np.inf,
        ensemble_size=ensemble_size,
        ensemble_nbest=max_keep_best,
        performance_range_threshold=performance_range_threshold,
        max_models_on_disc=None,
        seed=seed,
        shared_mode=False,
        precision=precision,
        max_iterations=1,
        read_at_most=1,
        memory_limit=memory_limit,
        random_state=1,
        sleep_duration=0)

    try:
        # Iterate over all models, taking the time each ensemble-building
        # step needs into account when creating the new trajectory
        current_ensemble_timestamp = 0
        skipped = 1
        for midx, model_path in enumerate(n_models):
            tstamp = model_timestamps[
                model_path.split("/")[-1].replace('.gz', '')
            ] - overall_start_time
            if current_ensemble_timestamp > tstamp:
                # While this model was built, the ensemble script was not yet done
                skipped += 1
                continue

            # Do one ensemble-building step
            start = time.time()
            ens_builder.random_state = check_random_state(1)
            print("############## %d: Working on %s (skipped %d)" %
                  (midx + 1, model_path, skipped - 1))
            logging.basicConfig(level=logging.DEBUG)
            ens_builder.read_at_most = skipped
            valid_pred, test_pred = ens_builder.main(return_pred=True)
            last_dur = time.time() - start
            current_ensemble_timestamp = tstamp + last_dur

            if current_ensemble_timestamp >= vanilla_results["0"]["time_limit"]:
                print("############## Went over time %f > %f; Stop here" %
                      (current_ensemble_timestamp,
                       vanilla_results["0"]["time_limit"]))
                break

            # Reset, since we have just read the model files
            skipped = 1

            if test_pred is None:
                # Adding this model did not change the ensemble; no new prediction
                continue

            if task_type == BINARY_CLASSIFICATION:
                # Recreate an n x 2 array from the positive-class column
                test_pred = np.concatenate(
                    [1 - test_pred.reshape([-1, 1]),
                     test_pred.reshape([-1, 1])], axis=1)

            # Build the trajectory entry
            score = 1 - balanced_accuracy(y_true=y_test, y_pred=test_pred)
            loss_trajectory.append((current_ensemble_timestamp, score))
            print("############## Round %d took %g sec" %
                  (midx, time.time() - start))
    finally:
        tmp_dir.cleanup()

    # Store the results
    result = dict()
    result[ensemble_size] = {
        'task_id': task_id,
        'time_limit': vanilla_results["0"]["time_limit"],
        'loss': loss_trajectory[-1][1],
        'configuration': {
            "n_models": n_models,
            "performance_range_threshold": performance_range_threshold,
            "ensemble_size": ensemble_size,
            "max_keep_best": max_keep_best,
            "seed": seed,
            "memory_limit": memory_limit,
            "precision": precision,
        },
        'n_models': len(n_models),
        'trajectory': loss_trajectory,
    }

    with open(fl_name, 'wt') as fh:
        json.dump(result, fh, indent=4)
    print("Dumped to %s" % fl_name)
import random

import torch
import torch.nn as nn
from tqdm import tqdm

from utils import load_task, make_word_vector, to_var, load_glove_weights, frobenius
from utils import save_pickle, load_pickle
from models import SelfAttentiveNet

data = load_task('./dataset/review.json')
# data = load_pickle('data.pickle')
# save_pickle(data, 'data.pickle')

vocab = set()
for review, _ in data:
    vocab |= set(review)
vocab = ['<PAD>'] + list(sorted(vocab))
w2i = dict((w, i) for i, w in enumerate(vocab, 0))
i2w = dict((i, w) for i, w in enumerate(vocab, 0))
print('vocab size', len(vocab))

n_dev = 2000
split_id = len(data) - n_dev
train_data = data[:split_id]
dev_data = data[split_id:]

n_epoch = 4
batch_size = 32
embd_size = 100
attn_hops = 60
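# --- Illustrative batching sketch (assumption) ---
# The vocabulary above reserves index 0 for '<PAD>'; a helper like the one
# below turns token lists into equal-length index tensors. This is a minimal
# sketch, independent of the repo's own make_word_vector, whose exact
# signature is not shown in this excerpt.
def pad_batch(reviews, w2i, max_len):
    """Map each token list to a fixed-length list of vocab indices,
    truncating long reviews and padding short ones with <PAD> (index 0)."""
    batch = []
    for review in reviews:
        idxs = [w2i[w] for w in review[:max_len]]
        idxs += [w2i['<PAD>']] * (max_len - len(idxs))
        batch.append(idxs)
    return torch.LongTensor(batch)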
taskFilename = sys.argv[1]
vectorsFilename = sys.argv[2]
pathToSVMFile = sys.argv[3]
clusterFile = sys.argv[4]
relFile = sys.argv[5]
pathToExpansionCache = sys.argv[6]
normalVectorsFile = sys.argv[7]

expansion = 5
window = 5
svmFileInfo = '_SVM_' + clusterFile.split('/')[-1] + \
    "_expansionParam" + str(expansion) + "_window" + str(window)
expansionCacheInfo = "_expansionParam_" + str(expansion)

print "Loading rel, task, vectors, and the words that have already been disambiguated"
rel = shelve.open(relFile)
task, tralala = load_task(taskFilename)
vectors = load_vectors(vectorsFilename)
normalVectors = load_vectors(normalVectorsFile)
disambiguatedWords = [x.split("_")[0] for x in os.listdir(pathToSVMFile)]

print "Reading agglomerative cluster centers"
clusterCenters = [getAverageWordRep(x, vectors)
                  for x in read_sets(clusterFile)]

print "Starting..."
# initiate empty ratings
methodsRating = []
humanRating = []
questions = task.values()
jointVocCache = dict()
partVoc = set(vectors.keys())
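# --- Schematic evaluation helper (assumption) ---
# methodsRating / humanRating above follow the evaluation pattern shared by
# these scripts: collect a model score and the human judgment for every
# covered question, then report the Spearman correlation. The helper below
# is a sketch only; the key holding the human rating is an assumption, not
# taken from this code.
def evaluate(questions, similarity, rating_key='rating'):
    method, human = [], []
    for q in questions:
        score = similarity(q['word1'], q['word2'])
        if score is None:
            continue  # pair not covered by the model
        method.append(score)
        human.append(float(q[rating_key]))
    return spearman(method, human)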
print "Loading stuf..." taskFilename = sys.argv[1] filename = sys.argv[2] # "../../../cluster_descriptors/enwiki8.clust-desc.shelve" vectorsFilename = sys.argv[3] vectors = load_vectors(vectorsFilename) d = shelve.open(filename) key_sets = [] newD = dict() vec_size = len(d.keys()) for i in xrange(vec_size): key_sets.append(set(d[str(i)].keys())) newD[i] = d[str(i)] task, _ = load_task(taskFilename) questions = task.values() methodsRating = [] humanRating = [] print "Answering", len(task), "questions..." for i in xrange(len(questions)): if i % 100 == 0 and not i == 0: print "\tIteration", i, ": ", spearman(methodsRating, humanRating) question = questions[i] word1 = Word(question['word1']).lemma() word2 = Word(question['word2']).lemma()
import sys

from utils import load_task, Word
from fast_utils import cosine_similarity, load_vectors, spearman

if __name__ == "__main__":
    print "Baseline with word vectors"
    if len(sys.argv) < 3:
        print "USAGE: python baseline_word2vec.py <PATH TO TASK> <PATH TO WORDVECTORS>"
        sys.exit()
    taskFilename = sys.argv[1]
    vectorsFilename = sys.argv[2]

    task, _ = load_task(taskFilename)
    vectors = load_vectors(vectorsFilename)

    methodsRating = []
    humanRating = []
    questions = task.values()
    coverage = 0
    for i in xrange(len(questions)):
        question = questions[i]
        word1 = Word(question['word1']).lemma()
        word2 = Word(question['word2']).lemma()
        if word1 in vectors and word2 in vectors:
            vec1 = vectors[word1]
            vec2 = vectors[word2]
            methodsRating.append(cosine_similarity(vec1, vec2))
create_symlink = args.create_symlink
only_check_stats_file = args.only_check_stats_file
max_runtime_limit = args.max_runtime_limit
disable_fallback = args.disable_fallback

with open(selector_file, 'rb') as fh:
    selector_dict = pickle.load(fh)
selector = selector_dict['selector']
methods_to_choose_from = selector_dict['methods_to_choose_from']
methods_information = selector_dict['methods_information']

if disable_fallback:
    if hasattr(selector, 'default_strategy_idx'):
        selector.default_strategy_idx = None

X_train, y_train, _, _, _ = load_task(task_id)
min_num_samples_per_class = np.min(np.unique(y_train, return_counts=True)[1])
meta_features = compute_meta_features(X_train, y_train)
del X_train
del y_train
meta_features = np.array([
    meta_features['NumberOfClasses'],
    meta_features['NumberOfFeatures'],
    meta_features['NumberOfInstances']
])
#meta_features = get_meta_features(
#    task_id, '/home/eggenspk/PoSHAutosklearn/2020_IEEE_Autosklearn_experiments/experiment_scripts/60MIN/AutoAuto_build_more_metafeatures/metafeatures/')
#meta_features = pd.Series(meta_features)
#for name in ['NumberOfClasses',
#             'NumberOfFeatures',
EVALUATION_INTERVAL = 1
GLOVE_PATH = '/home/crli/crli/glove.840B.300d.txt'
DROPOUT = 0.3
LAYER_NUM = 3
FIXED_EMBEDDING_NUM = 1000
SPAN_LENGTH = 15
KERNEL_SIZES = [2, 3]
TRAIN_PATH = './squad/train-v1.1.json'
DEV_PATH = './squad/dev-v1.1.json'
path_to_predictions = './predict/predict_result'
path_to_dev = DEV_PATH

if options.first_load_data:
    train_exs, dev_exs = utils.load_task(TRAIN_PATH, DEV_PATH)
    with open(options.train_data, 'wb') as fh:
        pickle.dump(train_exs, fh)
    with open(options.dev_data, 'wb') as fh:
        pickle.dump(dev_exs, fh)
else:
    with open(options.train_data, 'rb') as fh:
        train_exs = pickle.load(fh)
    with open(options.dev_data, 'rb') as fh:
        dev_exs = pickle.load(fh)

print('The size of train_set:', len(train_exs))
print('The size of dev_set:', len(dev_exs))
TRAIN_SIZE = len(train_exs)
DEV_SIZE = len(dev_exs)
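# --- Generic form of the load-once-then-pickle pattern above (sketch) ---
# The first_load_data branch parses SQuAD once and caches the result; later
# runs just unpickle it. A small helper capturing the same idea (the name and
# signature are mine, not from this codebase; assumes os and pickle are
# imported as in the surrounding script):
def cached(path, build):
    """Return the object pickled at `path`, or build, cache, and return it."""
    if os.path.exists(path):
        with open(path, 'rb') as fh:
            return pickle.load(fh)
    obj = build()
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)
    return obj

# e.g.: train_exs, dev_exs = cached('data.pickle',
#                                   lambda: utils.load_task(TRAIN_PATH, DEV_PATH))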
rel = shelve.open(relFile)

# open the vectors
print "Loading vectors"
vecs = load_vectors(vecFile)

# read clusters and get their cluster centers by taking the average...
print "Reading agglomerative cluster centers"
agglomerativeClusterCenters = [getAverageWordRep(x, vecs)
                               for x in read_sets(clusterFile)]

# set some parameters
expansion = 5
window = 5

# get the words that occur in the task and need to be compared
_, wordsToSplit = load_task(pathToTask)
indexCache = dict()
wordsToSplit = filter(lambda x: x not in alreadyDisambiguatedWords,
                      wordsToSplit)
total = len(wordsToSplit)
for i, word in enumerate(wordsToSplit):
    # progress
    print "Working on word ", word, i, " / ", total
    mySVM, availableSVM, expansionCache = getSVM(word,
                                                 read_file(textfile),
                                                 rel,
                                                 vecs,
                                                 agglomerativeClusterCenters,
                                                 indexCache,
                                                 expansionParam=expansion,
                                                 skipsize=window)
    # if we found an svm
    if availableSVM:
        # dump the svm
        pickle.dump(mySVM,
                    open(pathToSVMFile + word + '_SVM_' +
                         clusterFile.split('/')[-1] + "_expansionParam" +
                         str(expansion) + "_window" + str(window), 'w'))
        # open expansioncache shelve object
if os.path.exists(tmp_dir):
    print('Output directory %s already exists - no need to run this again!'
          % tmp_dir)
    exit(0)
else:
    os.makedirs(tmp_dir)

software_stats_file = os.path.join(tmp_dir, 'software.txt')
with open(software_stats_file, 'wt') as fh:
    fh.write(strio.read())

argparser_content = {}
for key, value in vars(args).items():
    argparser_content[key] = value
argparser_content_file = os.path.join(tmp_dir, 'arguments.json')
with open(argparser_content_file, 'wt') as fh:
    json.dump(argparser_content, fh, indent=4)

X_train, y_train, X_test, y_test, cat = load_task(task_id)

iterative_wo_early_stopping = ['extra_trees',
                               'PassiveAggressiveWOEarlyStopping',
                               'random_forest',
                               'SGDWOEarlyStopping',
                               'GradientBoostingClassifierWOEarlyStopping']
iterative_w_early_stopping = ['extra_trees', 'passive_aggressive',
                              'random_forest', 'sgd', 'gradient_boosting']

if not early_stopping:
    add_classifier_wo_early_stopping()

if searchspace == "iterative":
    include_estimator = (iterative_w_early_stopping if early_stopping
                         else iterative_wo_early_stopping)
    include_preprocessor = ["no_preprocessing", ]
elif searchspace == "iterative-preproc":
    include_estimator = (iterative_w_early_stopping if early_stopping
                         else iterative_wo_early_stopping)
    include_preprocessor = None
elif searchspace == "full":