def pass2(files_list):
    """
    Read files_list and verify file contents
    @return: List of files with errors
    """
    files_failed_list = []
    files_queue = queue.Queue()

    # Starting workers
    worker.Reader.mFilesTotal = len(files_list)
    threads = []
    for i in range(args.jobs):
        print_verbose("Started reader job {}".format(i + 1))
        th = worker.Reader(files_queue, files_failed_list)
        th.start()
        threads.append(th)

    # Send work to queue
    if args.jobs > 1:
        # Create a randomly ordered queue
        reordered_files = files_list.copy()
        while len(reordered_files) > 0:
            index = random.randint(0, len(reordered_files) - 1)
            files_queue.put(reordered_files[index])
            del reordered_files[index]
    else:
        for f_name in files_list:
            files_queue.put(f_name)

    # Wait until the queue is drained, then stop the workers
    files_queue.join()
    for th in threads:
        files_queue.put(None)
    for th in threads:
        th.join()

    return files_failed_list

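# Illustrative sketch (not part of the original module) of the worker pattern
# pass1/pass2 rely on: feed a queue.Queue, call join() to wait for the queued
# work, then push one None sentinel per thread so each worker loop exits.
import queue
import threading

def _worker(q, results):
    while True:
        item = q.get()
        if item is None:        # sentinel: stop this worker
            q.task_done()
            break
        results.append(item)    # stand-in for the real per-file work
        q.task_done()

q, results, threads = queue.Queue(), [], []
for _ in range(2):
    th = threading.Thread(target=_worker, args=(q, results))
    th.start()
    threads.append(th)
for name in ["f1", "f2", "f3"]:
    q.put(name)
q.join()                        # blocks until every queued item is task_done()
for th in threads:
    q.put(None)
for th in threads:
    th.join()
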
def run(common_args, cmd_argv):
    args = docopt(__doc__, argv=cmd_argv)

    # -b option is not supported/needed
    if (args['-b'] != None):
        sys.exit(
            "The '-b' option is not supported/needed. Use a 'remote-ref' as the <id> argument"
        )

    # Default Package name
    pkg = args['<repo>']
    if (args['-p']):
        pkg = args['-p']

    # Set directory for the subtree directory
    dst = os.path.join(args['<dst>'], pkg)
    dst = utils.force_unix_dir_sep(dst)
    utils.print_verbose(f"Location of the copy being updated: {dst}")

    # Update the 'subtree'
    cmd = f'git subtree pull --prefix {dst} {args["<origin>"]}/_git/{args["<repo>"]} {args["<id>"]} --squash'
    t = utils.run_shell(cmd, common_args['-v'])
    utils.check_results(
        t,
        "ERROR: Failed to update the subtree for the specified package/repository."
    )

def run( common_args, cmd_argv ):
    args = docopt(scm.umount.USAGE, argv=cmd_argv)

    # Success Msg
    if ( args['get-success-msg'] ):
        print( "Repo unmounted. You will need to perform a 'git add/rm' to remove the deleted files" )
        return

    # Error Msg
    if ( args['get-error-msg'] ):
        print( "" )   # No additional info
        return

    # -b option is not supported/needed
    if ( args['-b'] != None ):
        sys.exit( "The '-b' option is not supported/needed. Use a 'remote-ref' as the <id> argument" )

    # Default Package name
    pkg = args['<repo>']
    if ( args['-p'] ):
        pkg = args['-p']

    # Set the foreign package directory to be deleted
    dst = os.path.join( args['<dst>'], pkg )
    if ( not os.path.isdir(dst) ):
        sys.exit( f"ERROR: The Package/Directory - {dst} - does not exist." )
    utils.print_verbose( f"Package/directory being removed: {dst}" )

    # There is no 'git subtree rm' command --> we simply delete the package directory
    utils.set_tree_readonly( dst, False )
    utils.remove_tree( dst )

def calc_files_dirs(test_size, file_size=100 * 2**20, max_files=1024):
    # return: (list of dirs, list of files)
    files_required = int(test_size / file_size) + 1
    dirs_required = int(files_required / max_files) + 1

    # Making dirs
    res_dirs = []
    res_files = []
    print_verbose("Required {} files {} dirs ({} files per dir)".format(
        files_required, dirs_required, max_files))
    while dirs_required > 0:
        res_dirs.append(args.work_dir + "/" + "{:09d}".format(dirs_required))
        dirs_required -= 1

    index = 0
    counter = max_files
    while files_required > 0:
        file_path = "{}/{:09d}".format(res_dirs[index], counter)
        #file_path = args.work_dir + "/" + file_name
        res_files.append(file_path)
        counter -= 1
        if counter < 0:
            counter = max_files
            index += 1
        files_required -= 1

    return (res_dirs, res_files)

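# Illustrative sketch (not from the original) of how the counts come out for a
# 1 GiB test with the default 100 MiB file size. `args` here is a hypothetical
# stand-in for the module-level argparse namespace that calc_files_dirs reads,
# so this only works when it is defined in the same module as the function.
import types
args = types.SimpleNamespace(work_dir="/tmp/stress")   # hypothetical work dir
dirs, files = calc_files_dirs(test_size=1 * 2**30)     # 1 GiB
print(len(files))   # 11 -> int(2**30 / (100 * 2**20)) + 1
print(len(dirs))    # 1  -> int(11 / 1024) + 1
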
def run(common_args, cmd_argv):
    args = docopt(scm.mount.USAGE, argv=cmd_argv)

    # Success Msg
    if (args['get-success-msg']):
        print("Repo mounted and committed to your repo")
        return

    # Error Msg
    if (args['get-error-msg']):
        print("")  # No message
        return

    # Check if there are pending repo changes
    cmd = f'git diff-index HEAD --exit-code --quiet'
    t = utils.run_shell(cmd, False)
    cmd = f'git diff-index --cached HEAD --exit-code --quiet'
    t2 = utils.run_shell(cmd, False)
    utils.check_results(
        t,
        "ERROR: Your local repo has pending tree modifications (i.e. you need to do a commit/revert)."
    )
    utils.check_results(
        t2,
        "ERROR: Your local repo has pending index modifications (i.e. you need to do a commit/revert)."
    )

    # -b option is not supported/needed
    if (args['-b'] != None):
        sys.exit(
            "The '-b' option is not supported/needed. Use a 'remote-ref' as the <id> argument"
        )

    # Default Package name
    pkg = args['<repo>']
    if (args['-p']):
        pkg = args['-p']

    # Make sure the parent destination directory exists
    dst = args['<dst>']
    utils.mkdirs(dst)

    # Set directory for the subtree directory
    dst = os.path.join(dst, pkg)
    dst = utils.force_unix_dir_sep(dst)
    utils.print_verbose(f"Destination for the copy: {dst}")

    # Create a 'subtree'
    cmd = f'git subtree add --prefix {dst} {args["<origin>"]}/{args["<repo>"]}.git {args["<id>"]} --squash'
    t = utils.run_shell(cmd, common_args['-v'])
    if (utils.is_error(t)):  # Clean up dst dir if there was a failure
        utils.remove_tree(dst)
    utils.check_results(
        t,
        "ERROR: Failed to create a subtree for the specified package/repository."
    )

def read_matrix_from_file(filename, no_rows, no_cols):
    try:
        print_verbose("Trying to read matrix from ", filename, " with size ",
                      no_rows, "x", no_cols)
        with open(filename) as f:
            lines = f.readlines()
        return parse_matrix(lines, no_rows, no_cols)
    except Exception:
        print("Failed to read matrix from " + filename)
        return None

def createFile(self, file_name, binary_pattern, size=1024, seed=0):
    blocks = int(size / len(binary_pattern))
    try:
        with open(file_name, "wb") as binary_file:
            for i in range(blocks):
                binary_file.write(binary_pattern)
    except OSError as e:
        if e.errno == 28:  # errno 28 == ENOSPC (no space left on device)
            print_verbose("Disk full")

def pass3(dirs_list, files_list):
    # TODO: Remove dirs
    print_verbose("Removing files ")
    for file_name in files_list:
        os.remove(file_name)

    dirs_list.append(args.work_dir)
    for d in dirs_list:
        try:
            os.rmdir(d)
        except OSError as e:
            print_err("Cannot delete directory '{}': {}".format(d, e.strerror))

    print_verbose("Files removed. Test finished")

def write_matrix_to_file(filename, Mat):
    try:
        print_verbose("Trying to write matrix in ", filename)
        with open(filename, 'w+') as f:
            for line in Mat:
                three_decimal_row = ["%.3f" % i for i in line]
                row = ""
                for num in three_decimal_row:
                    row += str(num) + " "
                f.write(row + "\n")
    except Exception:
        print("Couldn't write matrix into file " + filename)
        raise

def run(common_args, cmd_argv):
    args = docopt(scm.copy.USAGE, argv=cmd_argv)

    # Use the mount command so as to have consistent pre/post GIT behavior
    # when adopting non-integrated packages
    if (not args['--force']):
        cmd_argv[0] = 'mount'
        cmd_argv.insert(1, '--noro')
        scm.git.mount.run(common_args, cmd_argv)

    # Do a brute force copy
    else:
        # -b option is not supported/needed
        if (args['-b'] != None):
            sys.exit(
                "The '-b' option is not supported/needed. Use a 'remote-ref' as the <id> argument"
            )

        # Default Package name
        pkg = args['<repo>']
        if (args['-p']):
            pkg = args['-p']

        # Make sure the destination directory exists
        dst = os.path.join(os.getcwd(), args['<dst>'])
        utils.print_verbose(f"Destination for the copy: {dst}")
        utils.mkdirs(dst)

        # Create a clone of the repo
        # NOTE: I hate cloning the entire repo - but I have not found a way to
        #       get JUST a snapshot by a remote-ref
        cmd = f'git clone --branch {args["<id>"]} --depth=1 {args["<origin>"]}/_git/{args["<repo>"]} {pkg}'
        utils.push_dir(dst)
        t = utils.run_shell(cmd, common_args['-v'])
        utils.pop_dir()
        if (utils.is_error(t)):  # Clean up dst dir if there was a failure
            utils.remove_tree(dst)
        utils.check_results(
            t,
            f"ERROR: Failed to retrieve/clone the specified package/repository. Note: the <id> ({args['<id>']}) MUST be a git TAG."
        )

        # Remove the .git directory since this is a non-tracked copy
        gitdir = os.path.join(dst, pkg, ".git")
        utils.remove_tree(
            gitdir,
            warn_msg="Not able to remove the .git directory for local copy")

def run(self):
    args = __main__.args
    # FIXME:
    total_files = self.mInQueue.qsize()
    while True:
        file_name = self.mInQueue.get()
        if file_name is None:
            break
        start_time = time.time()
        self.createFile(file_name, BINARY_PATTERN, args.file_size)
        self.mOutList.append(file_name)
        elapsed_time = time.time() - start_time
        files_left = self.mInQueue.qsize()
        percent = int(50 * (total_files - files_left) / total_files)
        print_verbose(
            "[{:02d}%,{},{} free] Created file {} [{}/s]".format(
                percent, files_left,
                format_human(get_free_space(args.work_dir)), file_name,
                format_human(int(args.file_size / elapsed_time))))
        files_left -= 1
        self.mInQueue.task_done()

def run(self):
    while True:
        file_name = self.mInQueue.get()
        if file_name is None:
            break

        offset = 0
        start_time = time.time()
        res = b""  # initialise so a failed first read does not hit an undefined name
        with open(file_name, "rb") as binary_file:
            error = False
            while not error:
                try:
                    res = binary_file.read(256)
                except OSError as e:
                    error = True
                    print_err(e)
                if not res:
                    break
                elif res != BINARY_PATTERN[0:len(res)]:
                    self.mOutList.append(file_name)
                    error = True
                offset += len(res)

        elapsed_time = time.time() - start_time
        files_left = Reader.mFilesTotal - Reader.mFilesProcessed
        percent = int(50 + 50 * Reader.mFilesProcessed / Reader.mFilesTotal)
        str_out = "[{:02d}% {} left {} err] File '{}' ".format(
            percent, files_left, len(self.mOutList), file_name)
        if not error:
            str_out += "[{} {}/s] Ok".format(
                format_human(offset),
                format_human(int(offset / elapsed_time)))
        else:
            str_out += "Error at offset 0x{:X}".format(offset)
        print_verbose(str_out)

        with Reader.mLock:
            Reader.mFilesProcessed += 1
        self.mInQueue.task_done()

def pass1(dirs_list, files_list):
    """
    Create the files listed in files_list using a pattern
    @return: List of files created
    """
    files_created = []
    print_verbose("Test size {}, file size {}.".format(
        format_human(args.test_size), format_human(args.file_size)))

    # Creating temporary dirs
    for dir_name in dirs_list:
        if not check_dir(dir_name, create=True):
            print_err("Cannot continue, {} is not a directory".format(dir_name))
        else:
            print_verbose("Created directory {}".format(dir_name))

    # Create worker queue; if jobs > 1 create a randomly ordered list
    files_queue = queue.Queue()
    if args.jobs > 1:
        reordered_files = files_list.copy()
        while len(reordered_files) > 0:
            index = random.randint(0, len(reordered_files) - 1)
            files_queue.put(reordered_files[index])
            del reordered_files[index]
    else:
        for f_name in files_list:
            files_queue.put(f_name)

    # Starting workers
    threads = []
    for i in range(args.jobs):
        print_verbose("Started writer job {}".format(i + 1))
        th = worker.Writer(files_queue, files_created)
        th.start()
        threads.append(th)

    files_queue.join()
    for th in threads:
        files_queue.put(None)
    for th in threads:
        th.join()

    return files_created

        try:
            os.rmdir(d)
        except OSError as e:
            print_err("Cannot delete directory '{}': {}".format(d, e.strerror))
    print_verbose("Files removed. Test finished")


args = parse_commanline()
files_failed_list = []
files_list = []
dirs_list = []

if getattr(args, '2', False):
    # Only do pass 2
    existent_files_list = []
    print_verbose("Running only pass 2, files verification")
    for root, dirs, files in os.walk(args.work_dir, topdown=False):
        for name in files:
            existent_files_list.append(os.path.join(root, name))
    print_verbose("Found {} files".format(len(existent_files_list)))
    files_failed_list = pass2(existent_files_list)
else:
    # Do pass1, pass2 and pass3
    dirs_list, files_list = calc_files_dirs(args.test_size, args.file_size,
                                            args.files_dir)
    files_queue = pass1(dirs_list, files_list)
    files_failed_list = pass2(files_queue)

if len(files_failed_list) > 0:
    print_err("Error: Test not passed, temporary files not removed")
else:

def main(run_dir="rfe_chain", start=None, start_auc=None,
         verbose=None, logfile=None):
    """
    Main function to run the chain.
    """
    if logfile is not None:
        sys.stdout = open(logfile, "w")

    # load starting json
    with open(start) as f:
        start = json.load(f)

    if start_auc is None:
        startauc = 0.8
    else:
        startauc = start_auc

    start['AUC_SCORE_PATH'] = run_dir

    # have to load a list of possible features to replace with
    if all("10feat" in feature for feature in start['FEATURES']):
        with open("10featlist.json") as fh:
            featlist = json.load(fh)['FEATURES']
    else:
        featlist = get_featlist()
    # and possible preceding modifiers
    modlist = get_modlist()

    # create list of combinations of these two lists
    comblist = []
    for mod in modlist:
        for feature in featlist:
            comblist.append('{0}_{1}_'.format(mod, feature))

    # define sampled json
    prevsample = copy.deepcopy(start)
    # initialise auc
    prevauc = startauc

    first = 1
    counter = 0
    converged = False
    # will decide what constitutes converged later
    while not converged:
        sample = copy.deepcopy(prevsample)

        # If this isn't the first one, sample new settings
        if not first:
            # Sample a new hdf5 and replace existing at random
            # Or, just push it in, or just drop a hdf5 at random
            utils.print_verbose("===== Sampling new proposal "
                                "settings ======", flag=verbose)
            # sample new settings: shuffle combinations
            random.shuffle(comblist)
            # pop 3 features off this
            added = [comblist.pop() for i in range(3)]
            # add them to the settings
            sample['FEATURES'] = added
            utils.print_verbose("============================"
                                "===============", flag=verbose)

        # ensure that ordering of the features is the same between jsons
        sample['FEATURES'].sort()

        # Then save this new json with a descriptive name
        # unless it's already been generated
        if first:
            featurerecord = "".join(sample['FEATURES'])
        else:
            featurerecord = featurerecord + "".join(sample['FEATURES'])
        md5name = hashlib.md5(featurerecord.encode('UTF-8')).hexdigest()

        # get a list of the files in the run_dir
        existingjsons = glob.glob(run_dir + "/*.json")
        # check if the md5 exists
        if os.path.join(run_dir, md5name + ".json") in existingjsons:
            # then load the results of that run
            with open(os.path.join(run_dir, "AUC_scores.csv"), "r") as fh:
                c = csv.reader(fh, delimiter="\t")
                utils.print_verbose("Already ran {0}, "
                                    "reading from results.".format(md5name),
                                    flag=verbose)
                for line in c:
                    # look for that md5sum
                    if md5name in line[0]:
                        auc_score = float(line[-1])
        else:
            # save a json with this name and run train.py on it
            samplefname = os.path.join(run_dir, md5name + ".json")
            utils.print_verbose("Creating new settings"
                                " file for {0}".format(samplefname),
                                flag=verbose)
            with open(samplefname, "w") as fh:
                json.dump(sample, fh)
            # call train.py
            try:
                if first:
                    auc_score_dict = train.main(samplefname,
                                                verbose=verbose,
                                                store_models=False,
                                                store_features=True)
                else:
                    picklefname = prevsamplefname.split(".")[0] + \
                        "_feature_dump.pickle"
                    # load the features saved in the last run
                    auc_score_dict = train.main(samplefname,
                                                verbose=verbose,
                                                store_models=False,
                                                store_features=True,
                                                load_pickled=picklefname)
                prevsamplefname = samplefname
                auc_score = auc_score_dict['all']
            except IndexError:
                print("Warning: accidentally added invalid feature.")
                os.remove(samplefname)
                # set auc to zero so these settings are not accepted
                auc_score = 0

        prevsample = sample
        # can't be first anymore
        first = 0

        # as it may be bad manners to run infinite loops
        counter += 1
        if counter > 100:
            converged = True

    return None

def main(settingsfname, verbose=False):

    settings = utils.get_settings(settingsfname)

    subjects = settings['SUBJECTS']

    data = utils.get_data(settings, verbose=verbose)

    metadata = utils.get_metadata()

    features_that_parsed = [
        feature for feature in settings['FEATURES']
        if feature in list(data.keys())
    ]

    settings['FEATURES'] = features_that_parsed

    utils.print_verbose("=====Feature HDF5s parsed=====", flag=verbose)

    # get model
    model_pipe = utils.build_model_pipe(settings)

    utils.print_verbose("=== Model Used ===\n"
                        "{0}\n==================".format(model_pipe),
                        flag=verbose)

    # dictionaries to store results
    subject_predictions = {}
    accuracy_scores = {}

    for subject in subjects:
        utils.print_verbose(
            "=====Training {0} Model=====".format(str(subject)),
            flag=verbose)

        # initialise the data assembler
        assembler = utils.DataAssembler(settings, data, metadata)
        X, y = assembler.test_train_discrimination(subject)

        # get the CV iterator
        cv = utils.sklearn.cross_validation.StratifiedShuffleSplit(
            y,
            random_state=settings['R_SEED'],
            n_iter=settings['CVITERCOUNT'])

        # initialise lists for cross-val results
        predictions = []
        labels = []
        allweights = []

        # run cross validation and report results
        for train, test in cv:

            # calculate the weights
            weights = utils.get_weights(y[train])
            # fit the model to the training data
            model_pipe.fit(X[train], y[train], clf__sample_weight=weights)
            # append new predictions
            predictions.append(model_pipe.predict(X[test]))

            # store the test weights (used to calculate the weighted accuracy below)
            weights = utils.get_weights(y[test])
            allweights.append(weights)

            # store true labels
            labels.append(y[test])

        # stack up the results
        predictions = utils.np.hstack(predictions)
        labels = utils.np.hstack(labels)
        weights = utils.np.hstack(allweights)

        # calculate the total accuracy
        accuracy = utils.sklearn.metrics.accuracy_score(
            labels, predictions, sample_weight=weights)

        print("Accuracy score for {1}: {0:.3f}".format(accuracy, subject))

        # add the accuracy score to the per-subject dict
        accuracy_scores.update({subject: accuracy})

        # store results from each subject
        subject_predictions[subject] = (predictions, labels, weights)

    # stack subject results (don't worry about this line)
    predictions, labels, weights = map(
        utils.np.hstack, zip(*list(subject_predictions.values())))

    # calculate global accuracy
    accuracy = utils.sklearn.metrics.accuracy_score(labels,
                                                    predictions,
                                                    sample_weight=weights)

    print("predicted accuracy score over all subjects: {0:.2f}".format(
        accuracy))

    # output the accuracy scores to file
    accuracy_scores.update({'all': accuracy})

    settings['DISCRIMINATE'] = 'accuracy_scores.csv'
    # settings['AUC_SCORE_PATH'] = 'discriminate_scores'
    utils.output_auc_scores(accuracy_scores, settings)

    return accuracy_scores

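# Small self-contained sketch (an assumption: plain scikit-learn is imported
# directly rather than through the utils wrapper) of the scoring step above:
# the per-sample weights are passed to accuracy_score via sample_weight so an
# unbalanced class split does not dominate the reported accuracy.
import numpy as np
from sklearn.metrics import accuracy_score

labels = np.array([0, 0, 0, 1])
predictions = np.array([0, 0, 1, 1])
# hypothetical weights: up-weight the rare positive class
weights = np.where(labels == 1, 3.0, 1.0)
print(accuracy_score(labels, predictions, sample_weight=weights))  # 5/6 ~= 0.833
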
def main(mcmcdir="hdf5mcmc", start=None, start_auc=None, verbose=True,
         logfile=None, discr_flag=False):
    """
    Contains the main loop for this script.
    Pseudo-MHMCMC to find optimal AUC scoring combinations of HDF5s.

    start - location of json file settings to begin at
    """
    if logfile is not None:
        sys.stdout = open(logfile, "w")

    # pseudo-code for the MCMC iteration
    # want it to start with the probably good features
    with open(start) as f:
        start = json.load(f)

    if start_auc is None:
        startauc = 0.8
    else:
        startauc = start_auc

    # hardcode AUC results to the hdf5mcmc directory
    start['AUC_SCORE_PATH'] = mcmcdir

    # have to load a list of possible features to replace with
    if all("10feat" in feature for feature in start['FEATURES']):
        with open("10featlist.json") as fh:
            featlist = json.load(fh)['FEATURES']
    else:
        featlist = get_featlist()
    # and possible preceding modifiers
    modlist = get_modlist()

    # define sampled json
    prevsample = copy.deepcopy(start)
    # initialise auc
    prevauc = startauc

    counter = 0
    converged = False
    # will decide what constitutes converged later
    while not converged:
        sample = copy.deepcopy(prevsample)

        # Sample a new hdf5 and replace existing at random
        # Or, just push it in, or just drop a hdf5 at random
        utils.print_verbose("===== Sampling new proposal "
                            "settings ======", flag=verbose)
        u = np.random.rand()
        if u < 0.25:
            # drop an element at random
            features = sample['FEATURES'][:]
            random.shuffle(features)
            dropped = features.pop()
            sample['FEATURES'] = features
            utils.print_verbose("Dropped feature {0}".format(dropped),
                                flag=verbose)
        elif u > 0.25 and u < 0.5:
            # keep trying to sample a new feature until we
            # find one that's not in there already
            while True:
                # push a new feature, but don't remove an old one
                newfeature = random.sample(featlist, 1)[0]
                newmod = random.sample(modlist, 1)[0]
                added = '{0}_{1}_'.format(newmod, newfeature)
                if added not in sample['FEATURES']:
                    break
            sample['FEATURES'].append(added)
            utils.print_verbose("Added feature {0}".format(added),
                                flag=verbose)
        elif u > 0.5:
            # push a new feature and remove an old one
            features = sample['FEATURES'][:]
            random.shuffle(features)
            dropped = features.pop()
            # keep trying to sample a new feature until we
            # find one that's not in there already
            while True:
                newfeature = random.sample(featlist, 1)[0]
                newmod = random.sample(modlist, 1)[0]
                added = '{0}_{1}_'.format(newmod, newfeature)
                if added not in sample['FEATURES']:
                    break
            features.append(added)
            sample['FEATURES'] = features
            utils.print_verbose("Switched feature {0} for "
                                "{1}".format(dropped, added), flag=verbose)
        utils.print_verbose("============================"
                            "===============", flag=verbose)

        # ensure that ordering of the features is the same between jsons
        sample['FEATURES'].sort()

        # Then save this new json with a descriptive name
        # unless it's already been generated
        md5name = hashlib.md5(
            "".join(sample['FEATURES']).encode('UTF-8')).hexdigest()

        # get a list of the files in the mcmcdir
        existingjsons = glob.glob(mcmcdir + "/*.json")
        # check if the md5 exists
        if os.path.join(mcmcdir, md5name + ".json") in existingjsons:
            # then load the results of that run
            with open(os.path.join(mcmcdir, "AUC_scores.csv"), "r") as fh:
                c = csv.reader(fh, delimiter="\t")
                utils.print_verbose("Already ran {0}, "
                                    "reading from results.".format(md5name),
                                    flag=verbose)
                for line in c:
                    # look for that md5sum
                    if md5name in line[0]:
                        auc_score = float(line[-1])
        else:
            # save a json with this name and run train.py on it
            samplefname = os.path.join(mcmcdir, md5name + ".json")
            utils.print_verbose("Creating new settings"
                                " file for {0}".format(samplefname),
                                flag=verbose)
            with open(samplefname, "w") as fh:
                json.dump(sample, fh)
            # call train.py or discriminate.py
            if discr_flag:
                try:
                    auc_score_dict = discriminate.main(samplefname,
                                                       verbose=verbose)
                    # don't want to rename this variable
                    # even though it is no longer an AUC score
                    # want a low accuracy score, strangely enough
                    auc_score = 1 - auc_score_dict['all']
                except IndexError:
                    print("Warning: accidentally added invalid feature.")
                    os.remove(samplefname)
                    # set auc to zero so these settings are not accepted
                    auc_score = 0
            else:
                try:
                    auc_score_dict = train.main(samplefname,
                                                verbose=verbose,
                                                store_models=False)
                    auc_score = auc_score_dict['all'] - 0.5
                except IndexError:
                    print("Warning: accidentally added invalid feature.")
                    os.remove(samplefname)
                    # set auc to zero so these settings are not accepted
                    auc_score = 0

        utils.print_verbose("==== Acceptance calculation ====", flag=verbose)
        # compute acceptance probability from AUC:
        # r = min(1, AUC/(previous AUC))
        acceptance = np.max([np.min([1, auc_score / prevauc]), 0])
        u = np.random.rand()
        # accept new point with probability r
        if u < acceptance:
            prevsample = sample
            # save current auc
            prevauc = auc_score
            utils.print_verbose("accepting new settings with probability "
                                "{0}".format(acceptance), flag=verbose)
        else:
            utils.print_verbose("rejecting new settings with probability "
                                "{0}".format(1.0 - acceptance), flag=verbose)
        utils.print_verbose("================================", flag=verbose)
        # otherwise prevsample is not overwritten, so continue from where it was

        # as it may be bad manners to run infinite loops
        counter += 1
        if counter > 100:
            converged = True

"are ok and test is active." continue A = read_matrix_from_file(fnameA, tests[test]['M'], tests[test]['K']) B = read_matrix_from_file(fnameB, tests[test]['K'], tests[test]['N']) C = read_matrix_from_file(fnameC, tests[test]['M'], tests[test]['N']) params = tests[test] print_verbose("Runing test", test) ts = datetime.datetime.now() result = f_dgemm(params['TRANSA'], params['TRANSB'], params['M'], params['N'], params['K'], params['ALPHA'], A, params['LDA'], B, params['LDB'], params['BETA'], C, params['LDC']) te = datetime.datetime.now() print_verbose("Test ", test, "done in", te-ts) #print_matrix(A, "-----Matrix A-----", "----------") #print_matrix(B, "-----Matrix B-----", "----------") #print_matrix(C, "-----Matrix C-----", "----------") top = "----Test " + test + ": " + str(params['ALPHA']) +\ "*A*B + " + str(params['BETA']) + "*C----" #print_matrix(result, top, "--------") write_matrix_to_file(fnameRes, result)
def f_dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC):
    """
    Return ALPHA * op(A) * op(B) + BETA * C.

    N N : A[M][K] * B[K][N] + C[M][N]
    N T : A[M][K] * B[N][K] + C[M][N] -> N = K
    T N : A[K][M] * B[K][N] + C[M][N] -> M = K
    T T : A[K][M] * B[N][K] + C[M][N] -> M = N = K

    Keyword arguments:
    ========================================================
    M     -- int M >= 0, number of rows of the matrices A and C
    N     -- int N >= 0, number of columns of the matrices B and C
    K     -- int K >= 0, number of columns of A and rows of B
    ALPHA -- double precision float scalar alpha
    A     -- matrix of double precision floats [LDA][ka]
             ka -- K when TRANSA = 'N' // M otherwise
    LDA   -- integer: first dimension of A. When TRANSA = 'N',
             LDA = max(1, M), otherwise LDA = max(1, K)
    B     -- matrix of double precision floats [LDB][kb]
             kb -- N when TRANSB = 'N' // K otherwise
    LDB   -- integer: first dimension of B. When TRANSB = 'N',
             LDB = max(1, K), otherwise LDB = max(1, N)
    BETA  -- double precision float scalar beta
    C     -- matrix of double precision floats [LDC][N]. Matrix C will be
             overwritten with the result matrix
    LDC   -- first dimension of matrix C, equal to max(1, M).
    ==========================================================
    """
    nota = (TRANSA == 'N')
    notb = (TRANSB == 'N')

    # check if A is transposed
    nrowa = K
    ncola = M
    if nota:
        print_verbose("Matrix A is not transposed")
        nrowa = M
        ncola = K

    # check if B is transposed
    nrowb = N
    if notb:
        print_verbose("Matrix B is not transposed")
        nrowb = K

    # Test input parameters
    if not nota and TRANSA != 'C' and TRANSA != 'T':
        perror("Wrong TRANSA parameter")
    elif not notb and TRANSB != 'C' and TRANSB != 'T':
        perror("Wrong TRANSB parameter")
    elif M < 0:
        perror("M < 0")
    elif N < 0:
        perror("N < 0")
    elif K < 0:
        perror("K < 0")
    elif LDA < max(1, nrowa):
        perror("LDA lower than max(1, A_#rows)")
    elif LDB < max(1, nrowb):
        perror("LDB lower than max(1, B_#rows)")
    elif LDC < max(1, M):
        perror("LDC lower than max(1, M)")

    print_verbose("Alpha:", ALPHA)
    print_verbose("Beta:", BETA)

    # Quick return
    if M == 0 \
            or N == 0 \
            or ((f_equal(ALPHA, 0.0) or (K == 0)) and f_equal(BETA, 1.0)):
        return C

    # If ALPHA is 0.0, only scale C by BETA and return
    if f_equal(ALPHA, 0.0):
        if f_equal(BETA, 0.0):
            for j in range(N):
                for i in range(M):
                    C[i][j] = 0.0
        else:
            for j in range(N):
                for i in range(M):
                    C[i][j] = BETA * C[i][j]
        return C

    # Start the operations
    if notb:
        if nota:
            # Form C := alpha*A*B + beta*C
            for j in range(N):
                if f_equal(BETA, 0.0):
                    for i in range(M):
                        C[i][j] = 0.0
                elif not f_equal(BETA, 1.0):
                    for i in range(M):
                        C[i][j] = BETA * C[i][j]
                for l in range(K):
                    temp = ALPHA * B[l][j]
                    for i in range(M):
                        C[i][j] = C[i][j] + temp * A[i][l]
        else:
            # Form C := alpha*A**T*B + beta*C
            for j in range(N):
                for i in range(M):
                    temp = 0.0
                    for l in range(K):
                        temp = temp + A[l][i] * B[l][j]
                    if f_equal(BETA, 0.0):
                        C[i][j] = ALPHA * temp
                    else:
                        C[i][j] = ALPHA * temp + BETA * C[i][j]
    else:
        if nota:
            # Form C := alpha*A*B**T + beta*C
            for j in range(N):
                if f_equal(BETA, 0.0):
                    for i in range(M):
                        C[i][j] = 0.0
                elif not f_equal(BETA, 1.0):
                    for i in range(M):
                        C[i][j] = BETA * C[i][j]
                for l in range(K):
                    temp = ALPHA * B[j][l]
                    for i in range(M):
                        C[i][j] = C[i][j] + temp * A[i][l]
        else:
            # Form C := alpha*A**T*B**T + beta*C
            for j in range(N):
                for i in range(M):
                    temp = 0.0
                    for l in range(K):
                        temp = temp + A[l][i] * B[j][l]
                    if f_equal(BETA, 0.0):
                        C[i][j] = ALPHA * temp
                    else:
                        C[i][j] = ALPHA * temp + BETA * C[i][j]

    # End of function
    return C

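# Minimal check (not from the original test harness, and assuming f_dgemm's
# module helpers print_verbose/f_equal are importable alongside it): multiply
# two small matrices in the no-transpose case and compare against the
# ALPHA*A*B + BETA*C result computed by hand.
A = [[1.0, 2.0],
     [3.0, 4.0]]          # 2x2, M=2, K=2
B = [[5.0, 6.0],
     [7.0, 8.0]]          # 2x2, K=2, N=2
C = [[1.0, 1.0],
     [1.0, 1.0]]          # 2x2, M=2, N=2
result = f_dgemm('N', 'N', 2, 2, 2, 1.0, A, 2, B, 2, 1.0, C, 2)
# A*B = [[19, 22], [43, 50]], so with ALPHA=1 and BETA=1:
print(result)             # [[20.0, 23.0], [44.0, 51.0]]
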