print("Number of nodes in network:%d" % network.number_of_nodes())

# Disabled step, kept for reference: remove unannotated genes from the network.
# for node in network.nodes():
#     if not node in gene_annotation:
#         network.remove_node(node)
# print "Number of nodes in network after removing unannotated genes:%d" % network.number_of_nodes()

# Drop isolated genes by keeping only the largest connected component.
# The component is chosen explicitly with max(..., key=len) rather than by
# indexing connected_component_subgraphs(...)[0]: that call only returned a
# size-sorted list in old networkx releases, became a generator later and was
# eventually removed. connected_components/subgraph/copy exist in all versions.
network = network.subgraph(max(nx.connected_components(network), key=len)).copy()
print("Number of nodes in network after removing individual genes:%d" % network.number_of_nodes())
print("Number of edges in network after removing individual genes:%d" % network.number_of_edges())

# Cross-validation set: annotated genes that are still present in the network.
network_annotated_gene = {}
for gene in gene_annotation:
    # Graph membership test; avoids scanning the full node list per gene.
    if gene in network:
        network_annotated_gene[gene] = gene_annotation[gene]
print("Number of annotated genes in network:%d" % len(network_annotated_gene))

# Term-similarity data loaded once at script level.
# NOTE(review): nothing below passes these explicitly, so they are presumably
# read as module globals by cross_validation() and its helpers - confirm.
sim_terms = utils.filtered_most_sim()
sim_cache = utils.read_sim(config.simcache_fpath)
wang_sim_cache = utils.read_sim(config.folder + "filtered_wang_sim.csv")

#remove_predict()

# Run cross validation once for each GONUMBER from 1 to 10.
# NOTE(review): cross_validation() takes no arguments, so it presumably reads
# the module-level GONUMBER set by this loop - confirm before renaming.
for GONUMBER in range(1, 11):
    cross_validation()
sim = compute_gene_sim_total(nterms, terms) #sim = compute_gene_sim_max(nterms, terms) non_sim_avg += sim non_sim_avg /= count print "%s, %f, %f" % (gene, sim_avg, non_sim_avg)


if __name__ == "__main__":
    # Load the GO graph, the gene annotations and the interaction network.
    dag = DAG(config.go_fpath)
    gene_annotation = utils.get_annotation(config.annotation_fpath,
                                           config.filtered_annotation_fpath,
                                           dag.get_root().id)
    # NOTE(review): term_ic is not used again in this block; presumably it is
    # read as a module global by the analysis functions - confirm.
    term_ic = utils.calculate_ic(gene_annotation, dag, config.ic_fpath)
    network = utils.create_network(config.network_fpath)

    # Remove unannotated genes from the network. Iterate over a snapshot of
    # the node list: removing nodes while iterating the graph's own node
    # container fails on networkx versions where nodes() is a live view.
    for node in list(network.nodes()):
        if node not in gene_annotation:
            network.remove_node(node)

    # Drop isolated genes by keeping only the largest connected component.
    # Selected explicitly with max(..., key=len) instead of indexing
    # connected_component_subgraphs(...)[0], which depends on an old
    # networkx release returning a size-sorted list.
    network = network.subgraph(max(nx.connected_components(network), key=len)).copy()

    sim_cache = utils.read_sim(config.simcache_fpath)

    # Alternative analyses, enable as needed:
    #compute_term_in_neighbour_ratio()
    #compute_avg_term_num()
    compute_avg_sim()
def _strongest_similarity(cterm, terms, sim_cache):
    """Return the largest sim_cache[cterm][term] over ``terms`` (-1.0 if empty)."""
    best = -1.0
    for term in terms:
        similarity = sim_cache[cterm][term]
        if similarity > best:
            best = similarity
    return best


def _candidate_score(network, gene, cterm, annotated_genes, predicted_genes, sim_cache):
    """Return the weighted support for ``cterm`` from the labelled neighbours of ``gene``."""
    score = 0.0
    for neighbour in network.neighbors(gene):
        # Known annotations take precedence over earlier predictions;
        # neighbours with neither contribute nothing.
        if neighbour in annotated_genes:
            neighbour_terms = annotated_genes[neighbour]
        elif neighbour in predicted_genes:
            neighbour_terms = predicted_genes[neighbour]
        else:
            continue

        max_sim = _strongest_similarity(cterm, neighbour_terms, sim_cache)

        # Weight the neighbour by its similarity to the gene's current
        # prediction, or to the candidate term alone on the first pass.
        if gene in predicted_genes:
            own_terms = predicted_genes[gene]
        else:
            own_terms = [cterm]
        weight = compute_gene_sim(own_terms, neighbour_terms, sim_cache)

        score += 1.0 * weight * max_sim
    return score


def iterate_weighted_mv(network, annotated_genes, go_num, max_iterations=20,
                        sim_cache_path="pfalciparum_data/modified_wang_sim.csv"):
    """Iteratively predict GO terms for the unannotated genes of ``network``.

    On every pass each gene that is not in ``annotated_genes`` gets a score
    for each of its candidate terms (from ``get_candidate_terms``), summed
    over its annotated or already-predicted neighbours, and keeps the
    ``go_num`` best-scoring terms. Passes repeat until the integer part of
    ``compute_total_sim`` stops changing or ``max_iterations`` is reached.

    Args:
        network: graph object providing ``nodes()`` and ``neighbors(gene)``.
        annotated_genes: mapping of gene -> iterable of known terms.
        go_num: maximum number of terms kept per predicted gene.
        max_iterations: upper bound on the number of passes (default 20,
            the value that used to be hard-coded).
        sim_cache_path: file passed to ``utils.read_sim``; the result is
            indexed as ``sim_cache[term_a][term_b]``. Defaults to the
            previously hard-coded path.

    Returns:
        dict mapping each predicted gene to a list of at most ``go_num`` terms.
    """
    predicted_genes = {}
    sim_cache = utils.read_sim(sim_cache_path)
    last_sum = -1.0

    for _ in range(max_iterations):
        for gene in network.nodes():
            if gene in annotated_genes:
                continue

            candidate_terms = get_candidate_terms(network, gene, annotated_genes, predicted_genes)
            if len(candidate_terms) == 0:
                # Nothing to vote on; any earlier prediction is left untouched.
                continue

            cterm_scores = {}
            for cterm in candidate_terms:
                cterm_scores[cterm] = _candidate_score(
                    network, gene, cterm, annotated_genes, predicted_genes, sim_cache)

            # Select the top go_num terms as predicted GO terms for the gene.
            # items() (not iteritems()) works on both Python 2 and Python 3.
            top_terms = heapq.nlargest(go_num, cterm_scores.items(), key=itemgetter(1))
            # Delete-then-insert, as before, so the replaced entry behaves
            # like a freshly added key.
            predicted_genes.pop(gene, None)
            predicted_genes[gene] = [term for term, _score in top_terms]

        total_sum = compute_total_sim(network, annotated_genes, predicted_genes, sim_cache)
        # Convergence is deliberately coarse: totals are compared after
        # truncation to int, so sub-unit changes count as "no change".
        if int(total_sum) == int(last_sum):
            break
        last_sum = total_sum

    return predicted_genes