def balance_predictions(predictions_true, predictions_false, n_random_negative_folds=None, replicable=123):
    """
    Balance two score lists by shrinking the longer one to the length of the shorter one.

    The longer list is handed to generate_samples_from_list_without_replacement,
    which yields samples ("folds") of the shorter list's length. The complete
    folds are summed position by position and divided by the number of
    complete folds; these averages replace the longer list.

    predictions_true: Scores of the positive instances.
    predictions_false: Scores of the negative instances.
    n_random_negative_folds: Number of negative scores to be averaged to be assigned as negative instance.
                             If None calculated to cover as much as non-seed scores as possible
                             (forwarded unchanged to the sampler, which implements this).
    replicable: Forwarded unchanged to the sampler (presumably a random seed -- confirm
                against generate_samples_from_list_without_replacement).

    Returns the pair (predictions_true, predictions_false), in that order, where the
    originally longer list has been replaced by a list of averaged scores.

    Raises AssertionError if both lists already have the same length, and
    ZeroDivisionError if the sampler yields no complete fold while the shorter
    list is non-empty.
    """
    assert len(predictions_true) != len(predictions_false), "predictions are already balanced"
    # The code below always shrinks "predictions_false"; when the positives are
    # the longer list the two roles are exchanged here and restored on return.
    swap = len(predictions_false) < len(predictions_true)
    if swap:
        predictions_true, predictions_false = predictions_false, predictions_true
    negative_sample_size = len(predictions_true)
    negative_scores = [0.0] * negative_sample_size  # per-position running sums
    n_fold = 0
    for sample in generate_samples_from_list_without_replacement(
        predictions_false, negative_sample_size, n_random_negative_folds, replicable=replicable
    ):
        if len(sample) < negative_sample_size:  # incomplete (last) fold
            continue
        n_fold += 1
        for i, val in enumerate(sample):
            negative_scores[i] += val
    # A list comprehension (rather than map) so callers get a real list on
    # Python 3 as well, exactly as they did on Python 2.
    predictions_false = [total / n_fold for total in negative_scores]
    if swap:
        return predictions_false, predictions_true
    return predictions_true, predictions_false
# Example #2
0
def balance_predictions(predictions_true, predictions_false, n_random_negative_folds=None, replicable=123):
    """
    Balance two score lists by shrinking the longer one to the length of the shorter one.

    The longer list is handed to generate_samples_from_list_without_replacement,
    which yields samples ("folds") of the shorter list's length. The complete
    folds are summed position by position and divided by the number of
    complete folds; these averages replace the longer list.

    predictions_true: Scores of the positive instances.
    predictions_false: Scores of the negative instances.
    n_random_negative_folds: Number of negative scores to be averaged to be assigned as negative instance.
                             If None calculated to cover as much as non-seed scores as possible
                             (forwarded unchanged to the sampler, which implements this).
    replicable: Forwarded unchanged to the sampler (presumably a random seed -- confirm
                against generate_samples_from_list_without_replacement).

    Returns the pair (predictions_true, predictions_false), in that order, where the
    originally longer list has been replaced by a list of averaged scores.

    Raises AssertionError if both lists already have the same length, and
    ZeroDivisionError if the sampler yields no complete fold while the shorter
    list is non-empty.
    """
    assert len(predictions_true) != len(predictions_false), "predictions are already balanced"
    # The code below always shrinks "predictions_false"; when the positives are
    # the longer list the two roles are exchanged here and restored on return.
    swap = len(predictions_false) < len(predictions_true)
    if swap:
        predictions_true, predictions_false = predictions_false, predictions_true
    negative_sample_size = len(predictions_true)
    negative_scores = [0.0] * negative_sample_size  # per-position running sums
    n_fold = 0
    for sample in generate_samples_from_list_without_replacement(
        predictions_false, negative_sample_size, n_random_negative_folds, replicable=replicable
    ):
        if len(sample) < negative_sample_size:  # incomplete (last) fold
            continue
        n_fold += 1
        for i, val in enumerate(sample):
            negative_scores[i] += val
    # A list comprehension (rather than map) so callers get a real list on
    # Python 3 as well, exactly as they did on Python 2.
    predictions_false = [total / n_fold for total in negative_scores]
    if swap:
        return predictions_false, predictions_true
    return predictions_true, predictions_false
# Example #3
0
def calculate_performance_metric_counts_using_random_negatives(node_to_score, setNodeTest, non_seeds, score_threshold, n_random_negative_folds=None, replicable=123):
    """
    Count true/false positives and negatives of a scoring at a given threshold.

    node_to_score: Mapping from node to its score.
    setNodeTest: Nodes counted as positives; a scored node in it is a true positive
                 when its score >= score_threshold and a false negative otherwise.
    non_seeds: Nodes from which the negatives are taken.
    score_threshold: Scores greater than or equal to this value are predicted positive.
    n_random_negative_folds: If 0 every scored node in non_seeds is used as a negative.
                             Otherwise forwarded to
                             generate_samples_from_list_without_replacement, which yields
                             samples of len(setNodeTest) nodes from non_seeds; the
                             false-positive and true-negative counts are then averaged
                             over the number of samples yielded.
    replicable: Forwarded unchanged to the sampler (presumably a random seed -- confirm).

    Returns the tuple (nTP, nFP, nFN, nTN) of floats.

    Raises ZeroDivisionError if the sampler yields no sample at all.
    """
    nTP = nFP = nFN = nTN = 0.0
    for node, score in node_to_score.items():  # if candidates based - for each candidate
        if node in setNodeTest:  # in the initial association file
            if score >= score_threshold:
                nTP += 1
            else:
                nFN += 1

    if n_random_negative_folds == 0:
        # Build the lookup set once: non_seeds may be a list, which would make
        # every membership test a linear scan.
        negatives = set(non_seeds)
        for node, score in node_to_score.items():
            if node in negatives:
                if score >= score_threshold:
                    nFP += 1
                else:
                    nTN += 1
    else:
        # Only the sampling path needs the project helper, so it is imported here.
        from selection_utilities import generate_samples_from_list_without_replacement

        n_actual_folds = 0
        for sample in generate_samples_from_list_without_replacement(
            non_seeds, len(setNodeTest), n_random_negative_folds, replicable=replicable
        ):
            n_actual_folds += 1
            # Look up the sampled nodes directly instead of rescanning every
            # scored node per fold; set() keeps duplicates counted once.
            for node in set(sample):
                if node not in node_to_score:
                    continue
                if node_to_score[node] >= score_threshold:
                    nFP += 1
                else:
                    nTN += 1
        nFP /= n_actual_folds
        nTN /= n_actual_folds
    return (nTP, nFP, nFN, nTN)
def calculate_performance_metric_counts_using_random_negatives(node_to_score, setNodeTest, non_seeds, score_threshold, n_random_negative_folds=None, replicable=123):
    """
    Count true/false positives and negatives of a scoring at a given threshold.

    node_to_score: Mapping from node to its score.
    setNodeTest: Nodes counted as positives; a scored node in it is a true positive
                 when its score >= score_threshold and a false negative otherwise.
    non_seeds: Nodes from which the negatives are taken.
    score_threshold: Scores greater than or equal to this value are predicted positive.
    n_random_negative_folds: If 0 every scored node in non_seeds is used as a negative.
                             Otherwise forwarded to
                             generate_samples_from_list_without_replacement, which yields
                             samples of len(setNodeTest) nodes from non_seeds; the
                             false-positive and true-negative counts are then averaged
                             over the number of samples yielded.
    replicable: Forwarded unchanged to the sampler (presumably a random seed -- confirm).

    Returns the tuple (nTP, nFP, nFN, nTN) of floats.

    Raises ZeroDivisionError if the sampler yields no sample at all.
    """
    nTP = nFP = nFN = nTN = 0.0
    for node, score in node_to_score.items():  # if candidates based - for each candidate
        if node in setNodeTest:  # in the initial association file
            if score >= score_threshold:
                nTP += 1
            else:
                nFN += 1

    if n_random_negative_folds == 0:
        # Build the lookup set once: non_seeds may be a list, which would make
        # every membership test a linear scan.
        negatives = set(non_seeds)
        for node, score in node_to_score.items():
            if node in negatives:
                if score >= score_threshold:
                    nFP += 1
                else:
                    nTN += 1
    else:
        # Only the sampling path needs the project helper, so it is imported here.
        from selection_utilities import generate_samples_from_list_without_replacement

        n_actual_folds = 0
        for sample in generate_samples_from_list_without_replacement(
            non_seeds, len(setNodeTest), n_random_negative_folds, replicable=replicable
        ):
            n_actual_folds += 1
            # Look up the sampled nodes directly instead of rescanning every
            # scored node per fold; set() keeps duplicates counted once.
            for node in set(sample):
                if node not in node_to_score:
                    continue
                if node_to_score[node] >= score_threshold:
                    nFP += 1
                else:
                    nTN += 1
        nFP /= n_actual_folds
        nTN /= n_actual_folds
    return (nTP, nFP, nFN, nTN)
def get_validation_node_scores_and_labels(
    file_result,
    file_seed_test_scores,
    file_node_scores,
    n_random_negative_folds=None,
    n_negatives=None,
    default_score=0,
    replicable=123,
    candidates_file=None,
    previous_negative_sample_size=None,
):
    """
    Returns a list of scores and labels [ ([0-1], [01]) ] for validation,
    together with the negative sample size that was used.

    file_result: File to parse output scores
    file_seed_test_scores: File to parse test seeds (each gets label 1)
    file_node_scores: File to parse all non seeds
    n_random_negative_folds: Number of non-seed scores to be averaged to be assigned as negative instance
                             If None calculated to cover as much as non-seed scores as possible
                             If 0 all negative data is used
    n_negatives: Number of negative instances
                 If None the same as number of test nodes
    default_score: Nodes whose score in file_node_scores equals this value are
                   treated as non seeds (the pool the negatives are drawn from)
    replicable: Forwarded unchanged to generate_samples_from_list_without_replacement
                (presumably a random seed -- confirm)
    candidates_file: If given, scores and non seeds are restricted to the nodes in this file
    previous_negative_sample_size: If given and larger than n_negatives, used as the
                                   negative sample size instead

    Returns (node_validation_data, negative_sample_size); negative_sample_size is
    None when n_random_negative_folds is 0.

    Raises KeyError if a test node, candidate or sampled non seed has no score in
    file_result, and ZeroDivisionError if the sampler yields no sample while the
    negative sample size is non-zero.
    """
    from guild_utilities import get_node_to_score, get_nodes

    node_to_score = get_node_to_score(file_result)
    test_nodes = get_nodes(file_seed_test_scores)
    initial_to_score = get_node_to_score(file_node_scores)
    non_seeds = set(node for node, score in initial_to_score.items() if score == default_score)
    node_validation_data = [(node_to_score[node], 1) for node in test_nodes]

    if candidates_file is not None:
        candidates = get_nodes(candidates_file)
        node_to_score = dict((node, node_to_score[node]) for node in candidates)
        # Keep non_seeds a set: the "all negatives" branch below intersects it
        # with another set, which raises TypeError for a list. The sampling
        # branch converts it to a list itself.
        non_seeds = non_seeds & candidates

    if n_random_negative_folds == 0:
        negative_sample_size = None
        node_validation_data.extend(
            [(node_to_score[node], 0) for node in set(node_to_score.keys()) & non_seeds]
        )
    else:
        n_actual_folds = 0
        if n_negatives is None:
            n_negatives = len(test_nodes)
        negative_sample_size = n_negatives
        if previous_negative_sample_size is not None:
            if previous_negative_sample_size > negative_sample_size:
                negative_sample_size = previous_negative_sample_size
        # Float accumulators so the average below is never an integer floor
        # division, whatever the type of the parsed scores.
        negative_scores = [0.0] * negative_sample_size
        non_seeds = list(non_seeds)
        # NOTE(review): a sample shorter than negative_sample_size still counts
        # as a fold here, so trailing positions are divided by more folds than
        # contributed to them -- confirm whether that is intended.
        for sample in generate_samples_from_list_without_replacement(
            non_seeds, negative_sample_size, n_random_negative_folds, replicable=replicable
        ):
            for i, node in enumerate(sample):
                negative_scores[i] += node_to_score[node]
            n_actual_folds += 1
        node_validation_data.extend([(total / n_actual_folds, 0) for total in negative_scores])
    return node_validation_data, negative_sample_size
# Example #6
0
def get_validation_node_scores_and_labels(
    file_result,
    file_seed_test_scores,
    file_node_scores,
    n_random_negative_folds=None,
    n_negatives=None,
    default_score=0,
    replicable=123,
    candidates_file=None,
    previous_negative_sample_size=None,
):
    """
    Returns a list of scores and labels [ ([0-1], [01]) ] for validation,
    together with the negative sample size that was used.

    file_result: File to parse output scores
    file_seed_test_scores: File to parse test seeds (each gets label 1)
    file_node_scores: File to parse all non seeds
    n_random_negative_folds: Number of non-seed scores to be averaged to be assigned as negative instance
                             If None calculated to cover as much as non-seed scores as possible
                             If 0 all negative data is used
    n_negatives: Number of negative instances
                 If None the same as number of test nodes
    default_score: Nodes whose score in file_node_scores equals this value are
                   treated as non seeds (the pool the negatives are drawn from)
    replicable: Forwarded unchanged to generate_samples_from_list_without_replacement
                (presumably a random seed -- confirm)
    candidates_file: If given, scores and non seeds are restricted to the nodes in this file
    previous_negative_sample_size: If given and larger than n_negatives, used as the
                                   negative sample size instead

    Returns (node_validation_data, negative_sample_size); negative_sample_size is
    None when n_random_negative_folds is 0.

    Raises KeyError if a test node, candidate or sampled non seed has no score in
    file_result, and ZeroDivisionError if the sampler yields no sample while the
    negative sample size is non-zero.
    """
    from guild_utilities import get_node_to_score, get_nodes

    node_to_score = get_node_to_score(file_result)
    test_nodes = get_nodes(file_seed_test_scores)
    initial_to_score = get_node_to_score(file_node_scores)
    non_seeds = set(node for node, score in initial_to_score.items() if score == default_score)
    node_validation_data = [(node_to_score[node], 1) for node in test_nodes]

    if candidates_file is not None:
        candidates = get_nodes(candidates_file)
        node_to_score = dict((node, node_to_score[node]) for node in candidates)
        # Keep non_seeds a set: the "all negatives" branch below intersects it
        # with another set, which raises TypeError for a list. The sampling
        # branch converts it to a list itself.
        non_seeds = non_seeds & candidates

    if n_random_negative_folds == 0:
        negative_sample_size = None
        node_validation_data.extend(
            [(node_to_score[node], 0) for node in set(node_to_score.keys()) & non_seeds]
        )
    else:
        n_actual_folds = 0
        if n_negatives is None:
            n_negatives = len(test_nodes)
        negative_sample_size = n_negatives
        if previous_negative_sample_size is not None:
            if previous_negative_sample_size > negative_sample_size:
                negative_sample_size = previous_negative_sample_size
        # Float accumulators so the average below is never an integer floor
        # division, whatever the type of the parsed scores.
        negative_scores = [0.0] * negative_sample_size
        non_seeds = list(non_seeds)
        # NOTE(review): a sample shorter than negative_sample_size still counts
        # as a fold here, so trailing positions are divided by more folds than
        # contributed to them -- confirm whether that is intended.
        for sample in generate_samples_from_list_without_replacement(
            non_seeds, negative_sample_size, n_random_negative_folds, replicable=replicable
        ):
            for i, node in enumerate(sample):
                negative_scores[i] += node_to_score[node]
            n_actual_folds += 1
        node_validation_data.extend([(total / n_actual_folds, 0) for total in negative_scores])
    return node_validation_data, negative_sample_size