Example #1
File: pig.py Project: smautner/pig
def calculate(idd, n_jobs, debug):
    """Executes FS and RPS for a given task. Executed by cluster.

    Args:
      idd (int): Jobid. Used to find the right task
      n_jobs (int): Number of parallel jobs made by the
                    random parameter search. Does nothing otherwise.
    """
    # task = Foldnr, mask, clfname, ftlist, fname, randseed
    task = b.loadfile(f"{tmpdirectory}/tasks.json")[
        idd]  # = (foldnr, fstype, args, clfname, randseed) or (foldnr, clfname, randseed)
    foldxy = np.load(f"{tmpdirectory}/folds.pkl", allow_pickle=True)[task[0]]
    df = pd.read_pickle(f"{tmpdirectory}/dataframe.pkl")
    if len(task) == 5:  # Normal procedure with Feature Selection first.
        foldnr, fstype, args, clfname, randseed = task
        ftlist, mask, fname = fs.feature_selection(foldxy, fstype, args, df)  # FS - Done.
    elif len(task) == 3:  # A set featurelist was used.
        foldnr, clfname, randseed = task
        ftlist = b.loadfile(f"{tmpdirectory}/set_fl.json")
        mask = [f in ftlist for f in df.columns]
        fname = "Set Featurelist"
    else:
        raise ValueError("Incorrect number of arguments in the taskfile: {len(task)} should be 5 or 3")
    scores, best_esti, y_labels, coefs = rps.random_param_search(
        mask, clfname, foldxy, n_jobs, df, randseed, debug)
    best_esti_params = best_esti.get_params()
    best_esti = (type(best_esti).__name__, best_esti_params)  # Creates readable tuple that can be dumped.
    b.dumpfile([foldnr, scores, best_esti, ftlist, fname, y_labels], f"{tmpdirectory}/task_results/{idd}.json")
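
A minimal sketch of how a cluster job might call calculate; the actual dispatch wrapper is not part of this excerpt, so reading the job id from sys.argv below is purely an assumption:

# Hypothetical per-job entry point; the real cluster submission script is not shown here.
import sys

if __name__ == "__main__":
    job_id = int(sys.argv[1])              # assumption: the scheduler passes the job id
    calculate(job_id, n_jobs=4, debug=False)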
Example #2
File: pig.py Project: smautner/pig
def make_set_fl_tasks(p, n, set_fl, clfnames, n_folds, randseed):
    """
    Similar to maketasks. Used when a fixed feature list (set_fl) is supplied
    and the feature selection step is skipped.

    Args:
      p (list): A list containing all the feature values
                for all the positive samples used.
      n (list): A list containing all the feature values
                for all the negative samples used.
      set_fl (list): The feature list used.
      clfnames (list(str)): A list containing all the classifier names used
                            for the random parameter search or the specific
                            classifiers to fit.
      n_folds (int): Number of folds created by stratified K-Fold.
      randseed (int): Random seed used by the whole program.

    Returns:
      len(tasks): The total number of tasks.
    """
    set_fl_tasks = []
    folds, df = makefolds(p, n, n_folds, randseed)
    for foldnr in range(n_folds):
        for clfname in clfnames:
            set_fl_tasks.append((foldnr, clfname, randseed))
    b.dumpfile(set_fl, f"{tmpdirectory}/set_fl.json")
    b.dumpfile(set_fl_tasks, f"{tmpdirectory}/tasks.json")
    np.array(folds, dtype=object).dump(f"{tmpdirectory}/folds.pkl")
    df.to_pickle(f"{tmpdirectory}/dataframe.pkl")
    return len(set_fl_tasks)
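
As a rough sketch of what ends up in tasks.json (fold count and classifier names below are made up), the set-featurelist tasks are just the cross product of fold numbers and classifier names, which calculate later unpacks in its three-element branch:

# Toy reconstruction of the task tuples make_set_fl_tasks produces
# (values are illustrative, not from the project).
n_folds = 3
clfnames = ["gradientboosting", "neuralnet"]   # hypothetical classifier names
randseed = 42

set_fl_tasks = [(foldnr, clfname, randseed)
                for foldnr in range(n_folds)
                for clfname in clfnames]
# -> 6 tasks: (0, 'gradientboosting', 42), (0, 'neuralnet', 42), (1, ...), ...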
Example #3
def create_blacklist(path="", make_histogram="avg"):
    """
    Main blacklist function. Creates the actual blacklist and dumps it.

    Args:
      path (str): Path to the pos/neg directories. The blacklist is also dumped here.
      make_histogram (str): If not empty, a histogram of the given type will be drawn.
    """
    if path:
        path += "/"
    seqdict, filedict = load(path)
    collisions = find_collisions(seqdict, filedict)
    blacklist = set()
    for x in collisions:
        a_loc, a_name = x[0][0].split("/")[-2:]
        b_loc, b_name = x[1][0].split("/")[-2:]
        if x[0][1][0] > x[1][1][0]:  # This picks which file has a higher sum of nucleotides
            blacklist.add(b_name)   # and adds the smaller file to the blacklist
        else:
            blacklist.add(a_name)
    blacklist.add("416-60776-0-1.sto") # Incompatible with RNAz
    b.dumpfile(list(blacklist), f"{path}blacklist.json")
    print(f"{len(blacklist)} blacklisted files")
    if make_histogram:
        make_histograms(filedict, blacklist, make_histogram)
    return blacklist
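
The indexing in the loop above implies a structure for the collision pairs returned by find_collisions: each element is a pair of (filepath, stats) entries whose stats[0] is the nucleotide sum. This layout is inferred from the code, not confirmed elsewhere in the excerpt; a small sketch with made-up data:

# Hypothetical collision entry, layout inferred from the indexing above.
x = (("pos/fileA.sto", (1200,)), ("neg/fileB.sto", (800,)))

a_loc, a_name = x[0][0].split("/")[-2:]   # -> "pos", "fileA.sto"
b_loc, b_name = x[1][0].split("/")[-2:]   # -> "neg", "fileB.sto"
blacklisted = b_name if x[0][1][0] > x[1][1][0] else a_name
# fileA has the higher nucleotide sum, so "fileB.sto" is blacklisted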
Example #4
File: pig.py Project: smautner/pig
def getresults():
    """Analyzes the result files in rps_results and
    returns only the ones with the best best_esti_score in each fold.
    """
    results = defaultdict(lambda: [[0]])
    for rfile in os.listdir(f"{tmpdirectory}/task_results"):
        f = b.loadfile(f"{tmpdirectory}/task_results/{rfile}")
        if f[1][0] > results[f[0]][0][0] or f[1][0] == -1:
            # For each fold, the result with the best best_esti_score is kept.
            # A best_esti_score of -1 means a set classifier was used (TODO: remove this special case).
            results[f[0]] = f[1:]
    b.dumpfile(results, "results/results.json")
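
Each result file read here is the list dumped by calculate in Example #1, i.e. [foldnr, scores, best_esti, ftlist, fname, y_labels], so f[0] is the fold number and f[1][0] the best_esti_score used in the comparison. A minimal sketch of the per-fold selection with made-up scores:

# Toy illustration of the "best score per fold" selection (scores are made up,
# and the entries are shortened to [foldnr, scores, ...]).
from collections import defaultdict

results = defaultdict(lambda: [[0]])
task_files = [
    [0, [0.61], "esti_a"],   # fold 0, best_esti_score 0.61
    [0, [0.74], "esti_b"],   # fold 0, higher score -> replaces the entry above
    [1, [0.58], "esti_c"],   # fold 1
]
for f in task_files:
    if f[1][0] > results[f[0]][0][0]:
        results[f[0]] = f[1:]
# results[0] == [[0.74], 'esti_b'], results[1] == [[0.58], 'esti_c']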
Example #5
File: pig.py Project: smautner/pig
def load_pn_files(use_rnaz, use_filters, numneg, randseed, debug):
    """Loads the positive/negative sample files, caching them on disk so that
    repeated runs with the same parameters can skip loadfiles.loaddata().
    """
    fn = f"{tmpdirectory}/pn_{use_rnaz}_{use_filters}_{numneg}_{randseed}_{debug}.json"

    # If a file with the loaded files already exists, skip loadfiles.loaddata()
    if os.path.isfile(fn):
        p, n = b.loadfile(fn)  # pos, neg from loaded file
    else:
        if use_filters:
            p, n = loadfiles.loaddata("data", numneg, randseed, use_rnaz)
        else:
            p, n = loadfiles.loaddata("data", numneg, randseed, use_rnaz, 'both', blacklist_file="noblacklist")
        b.dumpfile((p, n), fn)
    return p, n
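
As a usage note (parameter values are made up): with tmpdirectory set to "tmp", the call below would read or create the cache file tmp/pn_True_False_30000_42_False.json.

# Hypothetical call; numneg and randseed values are invented for the example.
p, n = load_pn_files(use_rnaz=True, use_filters=False, numneg=30000, randseed=42, debug=False)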
Example #6
File: pig.py Project: smautner/pig
def maketasks(p, n, fs_selection_methods, clfnames, n_folds, randseed, debug):
    """
    Creates and dumps the tasks, the dataframe and the folds created by
    stratified K-Fold, which are then read and executed by the cluster.

    Args:
      p (list): A list containing all the feature values
                for all the positive samples used.
      n (list): A list containing all the feature values
                for all the negative samples used.
      fs_selection_methods (dict): The dictionary of the Feature selection
                                   methods and their arguments.
      clfnames (list(str)): A list containing all the classifier names used
                            for the random parameter search or the specific
                            classifiers to fit.
      n_folds (int): Number of folds created by stratified K-Fold.
      randseed (int): Random seed used by the whole program.
      debug (bool): Debug mode (might not actually do anything at the moment).

    Returns:
      len(tasks): The total number of tasks.
    """
    tasks = []
    folds, df = makefolds(p, n, n_folds, randseed)  # numfolds = n_splits
    for foldnr in range(n_folds):
        for clfname in clfnames:
            for fstype, parameters in fs_selection_methods.items():
                for args in parameters:
                    if fstype == "Random":
                        num_features, num_random_tasks = args
                        for seed in range(num_random_tasks):  # Keep in mind this seed IS NOT randseed
                            tasks.append((foldnr, fstype, (num_features, seed), clfname, randseed))
                    elif fstype == "Forest" or fstype == "SVC1" or fstype == "SVC2":
                        tasks.append((foldnr, fstype, (randseed, args), clfname, randseed))
                    else:
                        tasks.append((foldnr, fstype, args, clfname, randseed))
    b.dumpfile(tasks, f"{tmpdirectory}/tasks.json")
    np.array(folds, dtype=object).dump(f"{tmpdirectory}/folds.pkl")
    df.to_pickle(f"{tmpdirectory}/dataframe.pkl")
    return len(tasks)
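
A small worked example of the task expansion (the non-Random method name and all values below are assumptions; only the loop structure is taken from maketasks): with 5 folds, one classifier, a Random entry of (40, 3) and two arguments for a second method, the loops emit 5 * (3 + 2) = 25 tasks.

# Toy reconstruction of the loop in maketasks; "Lasso" and all numbers are made up.
n_folds, randseed = 5, 42
clfnames = ["gradientboosting"]
fs_selection_methods = {
    "Random": [(40, 3)],     # 40 features, 3 random featurelists per fold
    "Lasso": [0.01, 0.05],   # hypothetical method with two argument values
}

tasks = []
for foldnr in range(n_folds):
    for clfname in clfnames:
        for fstype, parameters in fs_selection_methods.items():
            for args in parameters:
                if fstype == "Random":
                    num_features, num_random_tasks = args
                    for seed in range(num_random_tasks):
                        tasks.append((foldnr, fstype, (num_features, seed), clfname, randseed))
                else:
                    tasks.append((foldnr, fstype, args, clfname, randseed))

print(len(tasks))  # 5 folds * (3 random + 2 Lasso) = 25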
Example #7
def getresults2(numrandomtasks=10000, n_best=10, tmpdirectory="tmp"):
    """
    Calculates the average F1 score of each random featurelist over every fold.
    Collects the scores into a histogram and dumps it into the results directory.
    Also takes the n_best featurelists with the best average F1 scores and dumps them separately.

    Warning:
      This program will not work correctly if:
        - numrandomtasks is not the correct value
        - pig.py was executed with feature selection methods other than --random
        - pig.py was executed with more than one classifier

    Example:
      - Executed pig.py --random 40 10000 -n 7
      => Results in 70000 files with 40 features each.
      Files 0-9999 being 10000 different featurelists for the first fold.
      Files 10000-19999 being the same 10000 featurelists for the second fold.
      ...
      - This code will take taskid % numrandomtasks and calculate the
        average F1 score for each of these featurelists over every fold.
      => Takes 7 files each and calculates their average F1 score
      => Results in 10000 scores in total.

    Args:
      numrandomtasks (int): Needs to be the same number as the second argument
                            of --random in pig.py, i.e. the number of
                            random featurelists per fold.
      n_best (int): Number of best featurelists that should be saved separately.
      tmpdirectory (str): Location of the "tmp" directory.
    """
    score_d = defaultdict(list)
    avg_score_d = defaultdict(list)
    featurelist_d = defaultdict(list)

    i = 0  # Counts results with a NaN precision (debug counter, not used further)
    j = 0  # Counts NaN F1 scores (debug counter, not used further)
    for rfile in os.listdir(f"{tmpdirectory}/task_results"):
        taskid = int(rfile.split(".")[0])
        f = b.loadfile(f"{tmpdirectory}/task_results/{rfile}")
        scores = f[1]
        fl = f[3]  # Featurelist
        tpr, precision = scores[2][0], scores[2][2]
        if np.isnan(precision):
            i += 1
        score_d[taskid % numrandomtasks].append((tpr, precision))
        # e.g. 10000 dictionary entries with 7 score tuples each (one per fold)
        featurelist_d[taskid % numrandomtasks] = fl
        # e.g. 10000 different featurelists

    # Calculate the average F1 score of each entry
    f1_list = []  # Used for the histogram
    for key in score_d:
        sum_tpr, sum_precision = 0, 0
        for tpr, precision in score_d[key]:
            sum_tpr += tpr
            sum_precision += precision
        avg_tpr = sum_tpr / len(score_d[key])
        avg_precision = sum_precision / len(score_d[key])
        f1 = 2 * ((avg_precision * avg_tpr) / (avg_precision + avg_tpr))
        if np.isnan(f1):
            j += 1
        f1_list.append(f1)
        avg_score_d[key] = f1

    # Get the best n_best featurelists
    best_featurelists = {}
    for key, score in sorted(avg_score_d.items(), key=itemgetter(1), reverse=True)[:n_best]:
        best_featurelists[key] = (score, featurelist_d[key])
    b.dumpfile(best_featurelists, "results/best_featurelists.json")

    # Draw the histogram
    fontsize = 18  # Size of the text for the labels and legend.
    plt.figure(figsize=(12.8, 9.6))
    plt.xlabel("F1-Score", fontsize=fontsize)
    plt.ylabel("Number of Scores", fontsize=fontsize)
    plt.hist(f1_list, bins=100)
    plt.savefig("results/f1_histogram.png")
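
The grouping that the docstring describes boils down to modular arithmetic over the task ids; a minimal sketch for the 40/10000/7-fold example above:

# With 10000 random featurelists per fold and 7 folds (70000 result files),
# taskid % numrandomtasks identifies the featurelist and taskid // numrandomtasks
# would identify the fold the task belonged to.
numrandomtasks = 10000
taskid = 23456
featurelist_index = taskid % numrandomtasks   # -> 3456 (one of the 10000 featurelists)
fold_index = taskid // numrandomtasks         # -> 2 (the third fold)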