def balance_predictions(predictions_true, predictions_false, n_random_negative_folds=None, replicable=123):
    """
    Balance two score lists of different length by averaging random folds of the longer one.

    The longer list is sampled without replacement in folds whose size equals
    the length of the shorter list. The complete folds are averaged
    position-wise and the result replaces the longer list, so both returned
    sequences have the length of the shorter input.

    predictions_true: Scores of the positive instances
    predictions_false: Scores of the negative instances
    n_random_negative_folds: Number of negative scores to be averaged to be assigned as negative instance.
    If None calculated to cover as much as non-seed scores as possible
    replicable: Forwarded unchanged to generate_samples_from_list_without_replacement
    (presumably a random seed -- confirm against that helper)

    Returns the tuple (predictions_true, predictions_false); the entry that was
    longer on input is a new list of averaged scores, the other is returned as given.

    Raises AssertionError if both inputs have the same length and
    ZeroDivisionError if the shorter input is non-empty but no complete fold is produced.
    """
    assert len(predictions_true) != len(predictions_false)
    # Work with "true" as the shorter list; remember to undo the swap on return.
    swap = len(predictions_false) < len(predictions_true)
    if swap:
        predictions_true, predictions_false = predictions_false, predictions_true
    negative_sample_size = len(predictions_true)
    negative_scores = [0.0] * negative_sample_size
    n_fold = 0
    for sample in generate_samples_from_list_without_replacement(
        predictions_false, negative_sample_size, n_random_negative_folds, replicable=replicable
    ):
        if len(sample) < negative_sample_size:  # last fold
            # An incomplete fold would bias the trailing positions, so skip it.
            continue
        n_fold += 1
        for i, val in enumerate(sample):
            negative_scores[i] += val
    # Build a real list (not map()) so the result can be measured, indexed and
    # iterated more than once on Python 3 as well as Python 2.
    predictions_false = [total / n_fold for total in negative_scores]
    if swap:
        return predictions_false, predictions_true
    return predictions_true, predictions_false
def balance_predictions(predictions_true, predictions_false, n_random_negative_folds=None, replicable=123):
    """
    Balance two score lists of different length by averaging random folds of the longer one.

    The longer list is sampled without replacement in folds whose size equals
    the length of the shorter list. The complete folds are averaged
    position-wise and the result replaces the longer list, so both returned
    sequences have the length of the shorter input.

    predictions_true: Scores of the positive instances
    predictions_false: Scores of the negative instances
    n_random_negative_folds: Number of negative scores to be averaged to be assigned as negative instance.
    If None calculated to cover as much as non-seed scores as possible
    replicable: Forwarded unchanged to generate_samples_from_list_without_replacement
    (presumably a random seed -- confirm against that helper)

    Returns the tuple (predictions_true, predictions_false); the entry that was
    longer on input is a new list of averaged scores, the other is returned as given.

    Raises AssertionError if both inputs have the same length and
    ZeroDivisionError if the shorter input is non-empty but no complete fold is produced.
    """
    assert len(predictions_true) != len(predictions_false)
    # Work with "true" as the shorter list; remember to undo the swap on return.
    swap = len(predictions_false) < len(predictions_true)
    if swap:
        predictions_true, predictions_false = predictions_false, predictions_true
    negative_sample_size = len(predictions_true)
    negative_scores = [0.0] * negative_sample_size
    n_fold = 0
    for sample in generate_samples_from_list_without_replacement(
        predictions_false, negative_sample_size, n_random_negative_folds, replicable=replicable
    ):
        if len(sample) < negative_sample_size:  # last fold
            # An incomplete fold would bias the trailing positions, so skip it.
            continue
        n_fold += 1
        for i, val in enumerate(sample):
            negative_scores[i] += val
    # Build a real list (not map()) so the result can be measured, indexed and
    # iterated more than once on Python 3 as well as Python 2.
    predictions_false = [total / n_fold for total in negative_scores]
    if swap:
        return predictions_false, predictions_true
    return predictions_true, predictions_false
def calculate_performance_metric_counts_using_random_negatives(node_to_score, setNodeTest, non_seeds, score_threshold, n_random_negative_folds=None, replicable=123):
    """
    Count true/false positives and negatives at a score threshold.

    Positives are the scored nodes found in setNodeTest. Negatives are either
    all scored nodes found in non_seeds (n_random_negative_folds == 0) or
    random folds of non_seeds, each of size len(setNodeTest), whose counts are
    averaged over the number of folds actually produced.

    node_to_score: Mapping from node to its score
    setNodeTest: Container of the positive (test) nodes
    non_seeds: Collection of nodes negatives are taken from
    score_threshold: A node counts as predicted positive when score >= score_threshold
    n_random_negative_folds: Forwarded to generate_samples_from_list_without_replacement;
    0 disables sampling and uses every non-seed
    replicable: Forwarded unchanged to the sampler (presumably a random seed -- confirm)

    Returns the tuple (nTP, nFP, nFN, nTN) as floats.

    Raises ZeroDivisionError if sampling is requested but the sampler yields no fold.
    """
    from selection_utilities import generate_samples_from_list_without_replacement

    nTP = nFP = nFN = nTN = 0.0
    # if candidates based - for each candidate
    for node, score in node_to_score.items():
        if node in setNodeTest:  # in the initial association file
            if score >= score_threshold:
                nTP += 1
            else:
                nFN += 1
    if n_random_negative_folds == 0:
        # Hash the non-seeds once so membership tests are not linear scans.
        negatives = set(non_seeds)
        for node, score in node_to_score.items():
            if node in negatives:
                if score >= score_threshold:
                    nFP += 1
                else:
                    nTN += 1
    else:
        n_actual_folds = 0
        for sample in generate_samples_from_list_without_replacement(
            non_seeds, len(setNodeTest), n_random_negative_folds, replicable=replicable
        ):
            n_actual_folds += 1
            # Walk the (small) fold instead of the whole score table; sampled
            # nodes without a score are ignored, as before.
            for node in set(sample):
                if node not in node_to_score:
                    continue
                if node_to_score[node] >= score_threshold:
                    nFP += 1
                else:
                    nTN += 1
        nFP /= n_actual_folds
        nTN /= n_actual_folds
    return (nTP, nFP, nFN, nTN)
def get_validation_node_scores_and_labels(
    file_result,
    file_seed_test_scores,
    file_node_scores,
    n_random_negative_folds=None,
    n_negatives=None,
    default_score=0,
    replicable=123,
    candidates_file=None,
    previous_negative_sample_size=None,
):
    """
    Returns a list of scores and labels [ ([0-1], [01]) ] for validation,
    together with the negative sample size that was used

    file_result: File to parse output scores
    file_seed_test_scores: File to parse test seeds
    file_node_scores: File to parse all non seeds
    n_negatives: Number of negative instances
    If None the same as number of test nodes
    n_random_negative_folds: Number of non-seed scores to be averaged to be assigned as negative instance
    If None calculated to cover as much as non-seed scores as possible
    If 0 all negative data is used
    default_score: Nodes whose score in file_node_scores equals this value are treated as non-seeds
    replicable: Forwarded unchanged to generate_samples_from_list_without_replacement
    (presumably a random seed -- confirm against that helper)
    candidates_file: If given, scores and non-seeds are restricted to the nodes in this file
    previous_negative_sample_size: If given and larger than n_negatives, used as the negative sample size

    Returns (node_validation_data, negative_sample_size); negative_sample_size
    is None when n_random_negative_folds is 0.

    generate_samples_from_list_without_replacement is not imported here and
    has to be available in the enclosing module.
    """
    from guild_utilities import get_node_to_score, get_nodes

    node_to_score = get_node_to_score(file_result)
    test_nodes = get_nodes(file_seed_test_scores)
    initial_to_score = get_node_to_score(file_node_scores)
    non_seeds = set(node for node, score in initial_to_score.items() if score == default_score)
    # Positives are labelled from the unrestricted scores, before any candidate filtering.
    node_validation_data = [(node_to_score[node], 1) for node in test_nodes]
    if candidates_file is not None:
        candidates = get_nodes(candidates_file)
        node_to_score = dict((node, node_to_score[node]) for node in candidates)
        # Keep non_seeds a set: the "use all negatives" branch below intersects
        # it with another set, which raises TypeError for a list operand.
        non_seeds = non_seeds & candidates
    if n_random_negative_folds == 0:
        negative_sample_size = None
        node_validation_data.extend(
            [(node_to_score[node], 0) for node in set(node_to_score.keys()) & non_seeds]
        )
    else:
        n_actual_folds = 0
        if n_negatives is None:
            n_negatives = len(test_nodes)
        negative_sample_size = n_negatives
        if previous_negative_sample_size is not None:
            negative_sample_size = max(negative_sample_size, previous_negative_sample_size)
        # Float accumulators so the averaging below is never integer division.
        negative_scores = [0.0] * negative_sample_size
        non_seeds = list(non_seeds)
        # NOTE(review): unlike balance_predictions, a fold shorter than
        # negative_sample_size is not skipped here; if the sampler can yield
        # one, the trailing positions are averaged over too many folds -- confirm.
        for sample in generate_samples_from_list_without_replacement(
            non_seeds, negative_sample_size, n_random_negative_folds, replicable=replicable
        ):
            for i, node in enumerate(sample):
                negative_scores[i] += node_to_score[node]
            n_actual_folds += 1
        node_validation_data.extend([(total / n_actual_folds, 0) for total in negative_scores])
    return node_validation_data, negative_sample_size
def get_validation_node_scores_and_labels(
    file_result,
    file_seed_test_scores,
    file_node_scores,
    n_random_negative_folds=None,
    n_negatives=None,
    default_score=0,
    replicable=123,
    candidates_file=None,
    previous_negative_sample_size=None,
):
    """
    Returns a list of scores and labels [ ([0-1], [01]) ] for validation,
    together with the negative sample size that was used

    file_result: File to parse output scores
    file_seed_test_scores: File to parse test seeds
    file_node_scores: File to parse all non seeds
    n_negatives: Number of negative instances
    If None the same as number of test nodes
    n_random_negative_folds: Number of non-seed scores to be averaged to be assigned as negative instance
    If None calculated to cover as much as non-seed scores as possible
    If 0 all negative data is used
    default_score: Nodes whose score in file_node_scores equals this value are treated as non-seeds
    replicable: Forwarded unchanged to generate_samples_from_list_without_replacement
    (presumably a random seed -- confirm against that helper)
    candidates_file: If given, scores and non-seeds are restricted to the nodes in this file
    previous_negative_sample_size: If given and larger than n_negatives, used as the negative sample size

    Returns (node_validation_data, negative_sample_size); negative_sample_size
    is None when n_random_negative_folds is 0.

    generate_samples_from_list_without_replacement is not imported here and
    has to be available in the enclosing module.
    """
    from guild_utilities import get_node_to_score, get_nodes

    node_to_score = get_node_to_score(file_result)
    test_nodes = get_nodes(file_seed_test_scores)
    initial_to_score = get_node_to_score(file_node_scores)
    non_seeds = set(node for node, score in initial_to_score.items() if score == default_score)
    # Positives are labelled from the unrestricted scores, before any candidate filtering.
    node_validation_data = [(node_to_score[node], 1) for node in test_nodes]
    if candidates_file is not None:
        candidates = get_nodes(candidates_file)
        node_to_score = dict((node, node_to_score[node]) for node in candidates)
        # Keep non_seeds a set: the "use all negatives" branch below intersects
        # it with another set, which raises TypeError for a list operand.
        non_seeds = non_seeds & candidates
    if n_random_negative_folds == 0:
        negative_sample_size = None
        node_validation_data.extend(
            [(node_to_score[node], 0) for node in set(node_to_score.keys()) & non_seeds]
        )
    else:
        n_actual_folds = 0
        if n_negatives is None:
            n_negatives = len(test_nodes)
        negative_sample_size = n_negatives
        if previous_negative_sample_size is not None:
            negative_sample_size = max(negative_sample_size, previous_negative_sample_size)
        # Float accumulators so the averaging below is never integer division.
        negative_scores = [0.0] * negative_sample_size
        non_seeds = list(non_seeds)
        # NOTE(review): unlike balance_predictions, a fold shorter than
        # negative_sample_size is not skipped here; if the sampler can yield
        # one, the trailing positions are averaged over too many folds -- confirm.
        for sample in generate_samples_from_list_without_replacement(
            non_seeds, negative_sample_size, n_random_negative_folds, replicable=replicable
        ):
            for i, node in enumerate(sample):
                negative_scores[i] += node_to_score[node]
            n_actual_folds += 1
        node_validation_data.extend([(total / n_actual_folds, 0) for total in negative_scores])
    return node_validation_data, negative_sample_size