def calculate(idd, n_jobs, debug):
    """Executes FS and RPS for a given task. Executed by cluster.

    Args:
        idd (int): Jobid. Used to find the right task.
        n_jobs (int): Number of parallel jobs made by the random parameter
            search. Does nothing otherwise.
        debug (bool): Debug flag forwarded to the random parameter search.

    Raises:
        ValueError: If the task tuple read from the taskfile does not have
            5 (feature selection) or 3 (set featurelist) entries.
    """
    # task = (foldnr, fstype, args, clfname, randseed) or (foldnr, clfname, randseed)
    task = b.loadfile(f"{tmpdirectory}/tasks.json")[idd]
    foldxy = np.load(f"{tmpdirectory}/folds.pkl", allow_pickle=True)[task[0]]
    df = pd.read_pickle(f"{tmpdirectory}/dataframe.pkl")
    if len(task) == 5:
        # Normal procedure with Feature Selection first.
        foldnr, fstype, args, clfname, randseed = task
        ftlist, mask, fname = fs.feature_selection(foldxy, fstype, args, df)  # FS - Done.
    elif len(task) == 3:
        # A set featurelist was used; derive the mask from the stored list.
        foldnr, clfname, randseed = task
        ftlist = b.loadfile(f"{tmpdirectory}/set_fl.json")
        mask = [f in ftlist for f in df.columns]
        fname = "Set Featurelist"
    else:
        # BUG FIX: the original message lacked the f-prefix, so the literal
        # text "{len(task)}" was raised instead of the actual length.
        raise ValueError(f"Incorrect number of arguments in the taskfile: {len(task)} should be 5 or 3")
    scores, best_esti, y_labels, coefs = rps.random_param_search(
        mask, clfname, foldxy, n_jobs, df, randseed, debug)
    best_esti_params = best_esti.get_params()
    # Creates readable tuple that can be dumped.
    best_esti = (type(best_esti).__name__, best_esti_params)
    b.dumpfile([foldnr, scores, best_esti, ftlist, fname, y_labels],
               f"{tmpdirectory}/task_results/{idd}.json")
def make_set_fl_tasks(p, n, set_fl, clfnames, n_folds, randseed):
    """Similar to maketasks. Used if a set featurelist is used and the
    feature selection process is skipped.

    Args:
        p (list): Feature values for all the positive samples used.
        n (list): Feature values for all the negative samples used.
        set_fl (list): The feature list used.
        clfnames (list(string)): Classifier names used for random parameter
            search or specific classifiers to fit to.
        n_folds (int): Number of folds stratified K-Fold creates.
        randseed (int): Randomseed used by the whole program.

    Returns:
        len(tasks): The total number of tasks.
    """
    folds, df = makefolds(p, n, n_folds, randseed)
    # One task per (fold, classifier) pair — no feature selection arguments.
    set_fl_tasks = [(foldnr, clfname, randseed)
                    for foldnr in range(n_folds)
                    for clfname in clfnames]
    b.dumpfile(set_fl, f"{tmpdirectory}/set_fl.json")
    b.dumpfile(set_fl_tasks, f"{tmpdirectory}/tasks.json")
    np.array(folds, dtype=object).dump(f"{tmpdirectory}/folds.pkl")
    df.to_pickle(f"{tmpdirectory}/dataframe.pkl")
    return len(set_fl_tasks)
def create_blacklist(path="", make_histogram="avg"):
    """Main blacklist function. Creates the actual blacklist and dumps it.

    Args:
        path (str): Path to the pos/neg directories. Also dumps blacklist here.
        make_histogram (str): If not empty, a histogram will be drawn using
            the given type.
    """
    if path:
        path += "/"
    seqdict, filedict = load(path)
    collisions = find_collisions(seqdict, filedict)
    blacklist = set()
    for pair in collisions:
        first_loc, first_name = pair[0][0].split("/")[-2:]
        second_loc, second_name = pair[1][0].split("/")[-2:]
        # The file with the higher sum of nucleotides survives;
        # the smaller one goes onto the blacklist.
        if pair[0][1][0] > pair[1][1][0]:
            blacklist.add(second_name)
        else:
            blacklist.add(first_name)
    blacklist.add("416-60776-0-1.sto")  # Incompatible with RNAz
    b.dumpfile(list(blacklist), f"{path}blacklist.json")
    print(f"{len(blacklist)} blacklisted files")
    if make_histogram:
        make_histograms(filedict, blacklist, make_histogram)
    return blacklist
def getresults():
    """Analyzes the result files in task_results and returns only the ones
    with the best best_esti_score in each fold.
    """
    # Default entry compares as score 0, so any real result replaces it.
    results = defaultdict(lambda: [[0]])
    result_dir = f"{tmpdirectory}/task_results"
    for rfile in os.listdir(result_dir):
        data = b.loadfile(f"{result_dir}/{rfile}")
        foldnr = data[0]
        score = data[1][0]
        # Keep the best best_esti_score per fold. A score of -1 marks a set
        # classifier and always overwrites. (Original note: remove the == -1 part.)
        if score > results[foldnr][0][0] or score == -1:
            results[foldnr] = data[1:]
    b.dumpfile(results, f"results/results.json")
def load_pn_files(use_rnaz, use_filters, numneg, randseed, debug):
    """Return the positive/negative samples, reusing a cached dump if one
    exists for this exact parameter combination.
    """
    fn = f"{tmpdirectory}/pn_{use_rnaz}_{use_filters}_{numneg}_{randseed}_{debug}.json"
    if os.path.isfile(fn):
        # Cache hit: skip loadfiles.loaddata() entirely.
        p, n = b.loadfile(fn)
    else:
        if use_filters:
            p, n = loadfiles.loaddata("data", numneg, randseed, use_rnaz)
        else:
            p, n = loadfiles.loaddata("data", numneg, randseed, use_rnaz, 'both',
                                      blacklist_file="noblacklist")
        # Persist for the next run with the same parameters.
        b.dumpfile((p, n), fn)
    return p, n
def maketasks(p, n, fs_selection_methods, clfnames, n_folds, randseed, debug):
    """Creates and dumps tasks, dataframe and the folds created by kfold that
    are then read and executed by the cluster.

    Args:
        p (list): Feature values for all the positive samples used.
        n (list): Feature values for all the negative samples used.
        fs_selection_methods (dict): Feature selection methods mapped to their
            argument lists.
        clfnames (list(string)): Classifier names used for random parameter
            search or specific classifiers to fit to.
        n_folds (int): Number of folds stratified K-Fold creates.
        randseed (int): Randomseed used by the whole program.
        debug (bool): Debug Mode (Might not actually do anything atm.)

    Returns:
        len(tasks): The total number of tasks.
    """
    folds, df = makefolds(p, n, n_folds, randseed)  # numfolds = n_splits
    tasks = []
    for foldnr in range(n_folds):
        for clfname in clfnames:
            for fstype, parameters in fs_selection_methods.items():
                for args in parameters:
                    if fstype == "Random":
                        # args = (num_features, num_random_tasks); each random
                        # task gets its own seed — this seed IS NOT randseed.
                        num_features, num_random_tasks = args
                        tasks.extend(
                            (foldnr, fstype, (num_features, seed), clfname, randseed)
                            for seed in range(num_random_tasks))
                    elif fstype in ("Forest", "SVC1", "SVC2"):
                        tasks.append((foldnr, fstype, (randseed, args), clfname, randseed))
                    else:
                        tasks.append((foldnr, fstype, args, clfname, randseed))
    b.dumpfile(tasks, f"{tmpdirectory}/tasks.json")
    np.array(folds, dtype=object).dump(f"{tmpdirectory}/folds.pkl")
    df.to_pickle(f"{tmpdirectory}/dataframe.pkl")
    return len(tasks)
def getresults2(numrandomtasks=10000, n_best=10, tmpdirectory="tmp"):
    """Calculates the average F1 scores of each random featurelist over every
    fold. Collects these into a histogram and dumps them into the results
    directory. Also takes the n_best featurelists with the best average
    F1-Scores and dumps them too.

    Warning: This program will not work correctly if:
        - numrandomtasks is not the correct value
        - pig.py was executed with other feature selection methods than --random
        - pig.py was executed with more than one classifier

    Example:
        - Executed pig.py --random 40 10000 -n 7
          => Results in 70000 files with 40 features each. Files 0-9999 being
          10000 different featurelists for the 1. fold, files 10000-19999 being
          the same 10000 featurelists for the 2. fold, ...
        - This code takes taskid % numrandomtasks and calculates the average F1
          score of each featurelist over every fold (7 files each)
          => 10000 scores in total.

    Args:
        numrandomtasks (int): Must equal the 2. argument of --random in pig.py,
            i.e. the number of random featurelists per fold.
        n_best (int): Number of best featurelists that should be saved separately.
        tmpdirectory (str): Location of the "tmp" directory.
    """
    score_d = defaultdict(list)   # featurelist-id -> [(tpr, precision), ...] over folds
    featurelist_d = {}            # featurelist-id -> featurelist
    for rfile in os.listdir(f"{tmpdirectory}/task_results"):
        taskid = int(rfile.split(".")[0])
        f = b.loadfile(f"{tmpdirectory}/task_results/{rfile}")
        scores = f[1]
        tpr, precision = scores[2][0], scores[2][2]
        key = taskid % numrandomtasks  # Same featurelist across all folds.
        score_d[key].append((tpr, precision))
        featurelist_d[key] = f[3]
    # NOTE: the original kept two debug counters of NaN precisions/F1s that
    # were never read; they have been removed as dead code.
    avg_score_d = {}
    f1_list = []  # Used for the histogram.
    for key, pairs in score_d.items():
        avg_tpr = sum(t for t, _ in pairs) / len(pairs)
        avg_precision = sum(p for _, p in pairs) / len(pairs)
        denom = avg_precision + avg_tpr
        # Guard the harmonic mean against 0/0 (original raised ZeroDivisionError
        # when both averages were exactly 0); NaN still propagates as before.
        f1 = 2 * (avg_precision * avg_tpr) / denom if denom != 0 else 0.0
        f1_list.append(f1)
        avg_score_d[key] = f1
    # Get the n_best featurelists by average F1 score.
    best_featurelists = {
        key: (avg_score_d[key], featurelist_d[key])
        for key, _ in sorted(avg_score_d.items(), key=itemgetter(1), reverse=True)[:n_best]
    }
    b.dumpfile(best_featurelists, f"results/best_featurelists.json")
    # Draw the histogram.
    fontsize = 18  # Size of the text for the labels and legend.
    plt.figure(figsize=(12.8, 9.6))
    plt.xlabel("F1-Score", fontsize=fontsize)
    plt.ylabel("Number of Scores", fontsize=fontsize)
    plt.hist(f1_list, bins=100)
    plt.savefig("results/f1_histogram.png")