def gen_random_network_data(seed_graph, n_tries=10):
    log = get_logger()
    random_graph, tries = safe_perturbed_graph(seed_graph, n_tries)
    nodes = sorted(random_graph.nodes())
    D = regularized_laplacian(random_graph, nodes, lam=0.05)
    C = rkhs_factor(D)
    return dict(G=random_graph, D=D, C=C, nodes=nodes), tries
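
# regularized_laplacian() and rkhs_factor() are not shown in this snippet. Below
# is a minimal sketch of what they are assumed to compute, following the usual
# MUNK construction: the regularized Laplacian kernel D = (L + lam * I)^{-1} and
# a factor C with C @ C.T == D. The project's actual implementations may differ
# (e.g. in sparse handling or the choice of factorization).
import numpy as np
import networkx as nx

def regularized_laplacian_sketch(G, nodes, lam):
    # Dense regularized Laplacian kernel over the given node ordering.
    L = nx.laplacian_matrix(G, nodelist=nodes).toarray().astype(float)
    return np.linalg.inv(L + lam * np.eye(len(nodes)))

def rkhs_factor_sketch(D):
    # D is symmetric positive definite here, so a Cholesky factor C with
    # C @ C.T == D embeds each node as a row of C.
    return np.linalg.cholesky(D)
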
def main(args):
    log = get_logger()

    log.info('Loading edgelist from: %s', args.edgelist)
    seed_network = util.simple_two_core(
        nx.read_edgelist(args.edgelist, encoding='ascii'), verbose=False)

    log.info('Generating network...')
    t_start = time()
    data, tries = gen_random_network_data(seed_network, args.n_tries)
    t_end = time()
    elapsed = t_end - t_start
    log.info('Random network generated in %.2fs in %d tries', elapsed, tries)

    log.info('Saving random network to: %s', args.output)
    joblib.dump(data, args.output)
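
# A hypothetical argument parser for main() above, inferred only from the
# attributes it reads (args.edgelist, args.n_tries, args.output). The actual
# script's parser may use different flag names, defaults, or help text.
import argparse

def get_parser_sketch():
    parser = argparse.ArgumentParser(
        description='Generate a random network from a seed PPI edgelist')
    parser.add_argument('-e', '--edgelist', type=str, required=True)
    parser.add_argument('-n', '--n_tries', type=int, default=10)
    parser.add_argument('-o', '--output', type=str, required=True)
    return parser

# Example usage: main(get_parser_sketch().parse_args())
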
def run(args):
    # Load the homologs
    geneAToHomologs, geneBToHomologs = load_homolog_mapping_file(
        args.homolog_mapping_file)

    # Load the genetic interactions
    A_GI, A_SL, A = load_genetic_interactions_table(args.species_A_table_file)
    B_GI, B_SL, B = load_genetic_interactions_table(args.species_B_table_file)

    logger = get_logger()
    if args.verbose > 0:
        logger.info('* Loaded genetic interactions...')
        logger.info('\t- %s interactions in species A (%s SLs)' %
                    (len(A_GI), len(A)))
        logger.info('\t- %s interactions in species B (%s SLs)' %
                    (len(B_GI), len(B)))

    # Make the predictions. We predict an SL between (u, v) in species B iff
    # a pair of their homologs in species A is SL.
    B_edges, B_true, B_pred = predict_sl_from_homolog_mapping(
        (A_GI, A_SL, A), (B_GI, B_SL, B), geneBToHomologs, args.verbose)

    # Compute precision, recall, and FPR
    if args.verbose > 0:
        logger.info('* Evaluating results...')
        logger.info('\tPrecision: %.3f' % precision_score(B_true, B_pred))
        logger.info('\tRecall: %.3f' % recall_score(B_true, B_pred))
        logger.info('\tFPR: %.3f' % compute_fpr(B_true, B_pred))

    ############################################################################
    # OUTPUT
    ############################################################################
    # Output predictions
    valToName = {1: "SL", 0: "Non-SL"}
    items = [{
        "Gene A": u,
        "Gene B": v,
        "Ground truth": valToName[gt],
        "Predicted": valToName[p]
    } for (u, v), gt, p in zip(B_edges, B_true, B_pred)]
    df = pd.DataFrame(items)
    df.to_csv(args.output_file, sep='\t', index=False)
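
# compute_fpr() is referenced above but not defined in this snippet. A minimal
# sketch under the assumption that it reports the false positive rate
# FP / (FP + TN) for binary labels; the project's own helper may differ.
from sklearn.metrics import confusion_matrix

def compute_fpr_sketch(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return fp / (fp + tn)
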
def simplify_graph(G, verbose=True):
    '''
    Returns the simple/strict graph corresponding to the given graph
    (removes self-loops from G and returns the largest connected component)
    '''
    logger = get_logger()
    if not nx.is_connected(G):
        cc_list = list(nx.connected_component_subgraphs(G))
        cc_sizes = [len(x) for x in cc_list]
        largest_cc = max(cc_sizes)
        cc_sizes.remove(largest_cc)
        if verbose:
            logger.warning('Network has %d connected components', len(cc_list))
            logger.warning(
                '\tLargest is size %d and all the rest are %d or smaller',
                largest_cc, max(cc_sizes))
            logger.warning('\tUsing largest connected component')
        G = max(cc_list, key=len)
    G.remove_edges_from(list(G.selfloop_edges()))
    return G
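
# simplify_graph() relies on APIs that were removed in networkx 2.4
# (connected_component_subgraphs and Graph.selfloop_edges). If it needs to run
# against a newer networkx, an equivalent (unlogged) sketch looks like this:
import networkx as nx

def simplify_graph_nx24(G):
    # Keep the largest connected component, then drop self-loops.
    if not nx.is_connected(G):
        largest = max(nx.connected_components(G), key=len)
        G = G.subgraph(largest).copy()
    G.remove_edges_from(list(nx.selfloop_edges(G)))
    return G
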
def simple_two_core(G, verbose=True):
    '''
    Returns the simple, 2-core of the given graph
    '''
    logger = get_logger()

    # Get simple graph
    G = simplify_graph(G, verbose)

    # Compute 2-core
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    if verbose:
        logger.info('PPI info - # Nodes: %d, # Edges: %d', num_nodes, num_edges)
        logger.info('Computing 2 core')
    G = nx.k_core(G, 2)
    num_2core_nodes = G.number_of_nodes()
    num_2core_edges = G.number_of_edges()
    if verbose:
        logger.info('2 core info - # Nodes: %d, # Edges: %d',
                    num_2core_nodes, num_2core_edges)
        logger.info('2 core removed %d nodes and %d edges',
                    num_nodes - num_2core_nodes, num_edges - num_2core_edges)
    return G
def load_genetic_interactions_table(gi_table_file, ppi_file, sample_negs=False):
    logger = get_logger()
    GI = pd.read_csv(gi_table_file, sep='\t')

    # Explicitly filter out inconclusive interactions
    GI = GI[(GI['Category'] == 'SL') | (GI['Category'] == 'Non-SL')]
    SL = GI.loc[GI['Category'] == 'SL']
    pairs = set(frozenset([u, v]) for u, v in zip(SL['Gene A'], SL['Gene B']))

    # If requested, sample an equal number of random non-SL pairs from the
    # nodes of the PPI network
    if sample_negs:
        # We only sample negatives when the table contains SLs exclusively
        assert len(GI) == len(SL)
        nodes = set(
            munk.util.simple_two_core(nx.read_edgelist(ppi_file)).nodes())
        non_sl_pairs = set()
        non_gi_data = []
        while len(non_sl_pairs) != len(pairs):
            p = frozenset(random.sample(nodes, 2))
            if p not in pairs and p not in non_sl_pairs:
                _p = tuple(p)
                non_sl_pairs.add(p)
                non_gi_data.append({
                    'Category': 'Non-SL',
                    'Score': 1,
                    'Gene A': _p[0],
                    'Gene B': _p[1]
                })
        pairs = pairs | set(non_sl_pairs)
        logger.info('\tAdding %s non-SL pairs for %s total interactions' %
                    (len(non_sl_pairs), len(pairs)))
        GI = pd.concat([GI, pd.DataFrame(non_gi_data)])
        SL = GI.loc[GI['Category'] == 'SL']

    return GI, SL, pairs
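
# The genetic-interaction table read by load_genetic_interactions_table() is
# assumed to be a tab-separated file with at least the columns 'Gene A',
# 'Gene B', 'Category', and 'Score'. A tiny illustrative table with made-up
# gene names and category labels (the exact "inconclusive" label is a guess):
import pandas as pd

example_gi = pd.DataFrame([
    {'Gene A': 'GENE1', 'Gene B': 'GENE2', 'Category': 'SL', 'Score': 1.0},
    {'Gene A': 'GENE1', 'Gene B': 'GENE3', 'Category': 'Non-SL', 'Score': 0.2},
    {'Gene A': 'GENE2', 'Gene B': 'GENE4', 'Category': 'Inconclusive', 'Score': 0.5},
])
# example_gi.to_csv('example_gi_table.tsv', sep='\t', index=False)
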
def run(args):
    # Create logger
    logger = get_logger(args.verbosity)

    # Load the SLs
    logger.info('[Loading genetic interactions file]')
    df = pd.read_csv(args.gi_file, sep='\t',
                     dtype={'Gene A': str, 'Gene B': str, 'Score': float})
    pairToOutcome = dict(
        (frozenset([u, v]), c)
        for u, v, c in zip(df['Gene A'], df['Gene B'], df['Category'])
        if c != INCONCLUSIVE)

    if args.sinatra_featurize:
        # Load only SLs for SINaTRA features
        pairs = sorted(p for p, c in pairToOutcome.items() if c == SL)
        logger.info('- SINaTRA features: loaded %s SLs' % len(pairs))
    else:
        # We include SLs and non-SLs
        pairs = sorted(p for p, c in pairToOutcome.items())

        # Do some light reporting
        class_counter = Counter(pairToOutcome.values())
        n_sl = class_counter[SL]
        n_non_sl = class_counter[NON_SL]
        logger.info('- Loaded %s interactions (%s SL and %s non-SL)' %
                    (len(pairs), n_sl, n_non_sl))

    # Load the embedding
    logger.info('[Loading MUNK embedding]')
    obj = joblib.load(args.embedding_file)
    embedding = obj.get('X')
    nodes = obj.get('nodes')
    landmark_nodes = set(obj.get('landmarks'))
    node_set = set(nodes)
    n_nodes = len(nodes)
    nodeToIndex = dict(zip(nodes, range(n_nodes)))
    logger.info('- %s nodes' % n_nodes)
    logger.info('- %s landmarks' % len(landmark_nodes))

    # Restrict to pairs also in the network
    pairs = [p for p in pairs
             if all(u in node_set for u in p) and len(p) == 2]
    logger.info('- Restricting to %s pairs in the network' % len(pairs))
    if args.remove_landmarks:
        pairs = [p for p in pairs
                 if all(u not in landmark_nodes for u in p)]
        logger.info('- Restricting to %s pairs in the network without landmarks'
                    % len(pairs))

    if args.sinatra_featurize:
        # For SINaTRA features, we randomly sample a set of
        # non-SLs from nodes in the network
        import random
        sl_pairs = list(pairs)
        non_sl_pairs = set()
        while len(non_sl_pairs) != len(sl_pairs):
            if args.remove_landmarks:
                p = frozenset(random.sample(node_set - landmark_nodes, 2))
            else:
                p = frozenset(random.sample(nodes, 2))
            if p not in sl_pairs and p not in non_sl_pairs:
                non_sl_pairs.add(p)
        pairToOutcome.update((p, NON_SL) for p in non_sl_pairs)
        pairs = sl_pairs + list(non_sl_pairs)
        logger.info('\tAdding %s non-SL pairs for %s total interactions' %
                    (len(non_sl_pairs), len(pairs)))

    # Determine the feature function
    if args.feature_function == ADD_FEATURES:
        def feature_function(p):
            u, v = sorted(p)
            return embedding[nodeToIndex[u]] + embedding[nodeToIndex[v]]
    elif args.feature_function == MEAN_FEATURES:
        def feature_function(p):
            u, v = sorted(p)
            return (embedding[nodeToIndex[u]] + embedding[nodeToIndex[v]]) / 2.
    else:
        raise NotImplementedError('Feature function "%s" not implemented' %
                                  args.feature_function)

    # Construct the features and outcomes
    n_pairs = len(pairs)
    X = np.array([feature_function(p) for p in pairs])
    y = np.array([pairToOutcome[p] == SL for p in pairs])

    # Output to file
    output = dict(X=X, y=y, pairs=pairs, params=vars(args))
    joblib.dump(output, args.output_file)
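
# The module-level constants used by run() above (SL, NON_SL, INCONCLUSIVE,
# ADD_FEATURES, MEAN_FEATURES) are defined elsewhere in the script. Plausible
# values, consistent with the 'Category' labels in the GI tables and the
# --feature_function choices, are sketched below; the actual values may differ.
SL = 'SL'
NON_SL = 'Non-SL'
INCONCLUSIVE = 'Inconclusive'
ADD_FEATURES = 'add'
MEAN_FEATURES = 'mean'
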
def main(args):
    log = get_logger()
    source_data_files = args.source_data_files
    target_data_files = args.target_data_files
    assert len(source_data_files) == len(target_data_files)

    log.info('Loading homologs list from %s', args.homolog_list)
    raw_homologs = util.read_homolog_list(args.homolog_list)

    log.info('Loading source edgelist from %s', args.source_edgelist)
    source_G = nx.read_edgelist(args.source_edgelist, encoding='ascii')
    log.info('Loading target edgelist from %s', args.target_edgelist)
    target_G = nx.read_edgelist(args.target_edgelist, encoding='ascii')

    log.info('Computing MUNK embeddings of real PPI networks with %d landmarks',
             args.n_landmarks)
    n_landmarks = args.n_landmarks
    source_G = util.simple_two_core(source_G)
    target_G = util.simple_two_core(target_G)
    homologs = util.homologs_in_graphs(source_G, target_G, raw_homologs)
    source_nodes = sorted(source_G.nodes())
    target_nodes = sorted(target_G.nodes())
    source_D = munk.regularized_laplacian(source_G, source_nodes, args.lam)
    target_D = munk.regularized_laplacian(target_G, target_nodes, args.lam)
    source_C = munk.rkhs_factor(source_D)
    source_data = dict(C=source_C, nodes=source_nodes)
    target_data = dict(D=target_D, nodes=target_nodes)

    real_diff, other_mean, hom_mean = difference_in_means(
        source_data, target_data, homologs, n_landmarks,
        source_nodes, target_nodes)
    log.info('Difference between homolog and non-homolog similarity scores: %f',
             real_diff)

    n_permutations = len(source_data_files)
    log.info('Loading %d pairs of graphs from disk and computing differences in means...',
             n_permutations)
    means = Parallel(n_jobs=args.n_jobs)(
        delayed(difference_in_means_from_files)(
            source_data_fp, target_data_fp, homologs, n_landmarks,
            source_nodes, target_nodes)
        for source_data_fp, target_data_fp in zip(source_data_files,
                                                  target_data_files))

    errs = [1 for m in means if m is None]
    log.info('ERRORS: %d', len(errs))
    means = [m for m in means if m is not None]
    rand_G_mean_diffs, rand_G_other_means, rand_G_hom_means = zip(*means)
    log.info('Mean difference in means between homolog and non-homolog scores '
             'for random graphs: %f', np.mean(rand_G_mean_diffs))

    p_val, n_less_than, n_observations = one_tail_pval(real_diff, rand_G_mean_diffs)
    log.info('N permutations less than real: %d, N permutations: %d',
             n_less_than, n_observations)
    e_size = effect_size(real_diff, rand_G_mean_diffs)
    log.info('P-value: %f', p_val)
    log.info('Effect size: %f', e_size)
    log.info('# permutations: %d', n_permutations)

    results = dict(pval=p_val, effect_size=e_size,
                   n_permutations=n_observations, n_less_than=n_less_than,
                   errs=len(errs))
    log.info('Writing results to %s', args.output_file)
    with open(args.output_file, 'w') as OUT:
        json.dump(results, OUT, indent=2)

    diffs = dict(
        real=dict(mean_diff=real_diff, non_hom_mean=other_mean,
                  hom_mean=hom_mean),
        random=dict(mean_diffs=rand_G_mean_diffs,
                    non_hom_means=rand_G_other_means,
                    hom_means=rand_G_hom_means))
    log.info('Writing values of differences to %s', args.diffs_output_file)
    joblib.dump(diffs, args.diffs_output_file)
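
# one_tail_pval() and effect_size() are used above but not defined in this
# snippet. Minimal sketches under standard permutation-test assumptions: the
# one-tailed p-value is the fraction of permuted differences at least as large
# as the observed difference, and the effect size is the observed difference
# expressed in standard deviations of the permuted differences. The project's
# own definitions may differ (e.g. a +1 correction on the p-value).
import numpy as np

def one_tail_pval_sketch(observed, permuted):
    permuted = np.asarray(permuted)
    n_less_than = int(np.sum(permuted < observed))
    n_observations = len(permuted)
    p_val = (n_observations - n_less_than) / n_observations
    return p_val, n_less_than, n_observations

def effect_size_sketch(observed, permuted):
    permuted = np.asarray(permuted)
    return (observed - permuted.mean()) / permuted.std()
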
def run(args):
    # Set up logging
    logger = get_logger(args.verbose)
    logger.info('* Loading input files...')

    # Load the homologs
    logger.info('\t- Loading homolog mapping...')
    geneAToHomologs, geneBToHomologs = \
        load_homolog_mapping_file(args.homolog_mapping_file)

    # Load the genetic interactions
    logger.info('\t- Loading genetic interaction files...')
    if args.sample_negs:
        logger.info('\t- Sampling Non-SLs...')
    As, Bs = [], []
    for species_A_file, species_A_ppi in zip(args.species_A_files,
                                             args.species_A_ppis):
        As.append(load_genetic_interactions_table(
            species_A_file, species_A_ppi, args.sample_negs))
    for species_B_file, species_B_ppi in zip(args.species_B_files,
                                             args.species_B_ppis):
        Bs.append(load_genetic_interactions_table(
            species_B_file, species_B_ppi, args.sample_negs))

    # Make predictions
    logger.info('* Making predictions for each pair of datasets...')
    items = []
    for (A_name, (A_GI, A_SL, A)), (B_name, (B_GI, B_SL, B)) in \
            product(zip(args.species_A_names, As),
                    zip(args.species_B_names, Bs)):
        # Simple progress
        if args.verbose > 0:
            logger.info('\t- %s vs. %s' % (A_name, B_name))

        # Predict B from A
        B_edges, B_true, B_pred = predict((A_GI, A_SL, A), (B_GI, B_SL, B),
                                          geneBToHomologs, args.verbose)
        items.append({
            "Dataset A (Species)": '%s (%s)' % (A_name, args.species_names[0]),
            "Dataset B (Species)": '%s (%s)' % (B_name, args.species_names[1]),
            "Precision": precision_score(B_true, B_pred),
            "Recall": recall_score(B_true, B_pred),
            "False Positive Rate": compute_fpr(B_true, B_pred),
            "F1 Score": f1_score(B_true, B_pred),
        })

        # Predict A from B
        A_edges, A_true, A_pred = predict((B_GI, B_SL, B), (A_GI, A_SL, A),
                                          geneAToHomologs, args.verbose)
        items.append({
            "Dataset A (Species)": '%s (%s)' % (B_name, args.species_names[1]),
            "Dataset B (Species)": '%s (%s)' % (A_name, args.species_names[0]),
            "Precision": precision_score(A_true, A_pred),
            "Recall": recall_score(A_true, A_pred),
            "False Positive Rate": compute_fpr(A_true, A_pred),
            "F1 Score": f1_score(A_true, A_pred),
        })

    # Output to file
    logger.info('* Outputting to file...')
    df = pd.DataFrame(items)[
        ['Dataset A (Species)', 'Dataset B (Species)', 'Precision',
         'Recall', 'False Positive Rate', 'F1 Score']]
    df.to_csv(args.output_file, index=False, sep='\t')
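
# load_homolog_mapping_file() (used in both run() functions above) is not shown
# here. A minimal sketch under the assumption that the mapping file is a
# two-column, tab-separated list of (species A gene, species B gene) pairs and
# that the function returns dictionaries mapping each gene to the set of its
# homologs in the other species; the real file format and column order may differ.
from collections import defaultdict

def load_homolog_mapping_file_sketch(path):
    geneAToHomologs = defaultdict(set)
    geneBToHomologs = defaultdict(set)
    with open(path) as fh:
        for line in fh:
            if not line.strip() or line.startswith('#'):
                continue
            gene_a, gene_b = line.split()[:2]
            geneAToHomologs[gene_a].add(gene_b)
            geneBToHomologs[gene_b].add(gene_a)
    return dict(geneAToHomologs), dict(geneBToHomologs)
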
# Add classifier choices with subparsers
subparser = parser.add_subparsers(dest='classifier', help='Classifier')
rf_parser = subparser.add_parser('rf')
rf_parser.add_argument('-md', '--max_depth', type=int, default=None,
                       required=False)
rf_parser.add_argument('-nt', '--n_trees', type=int, nargs='*', required=False,
                       default=[10, 100, 250, 500])
svm_parser = subparser.add_parser('svm')
svm_parser.add_argument('-sc', '--svm_Cs', type=float, required=False, nargs='*',
                        default=[0.01, 0.1, 1, 10, 100, 1000, 10000])
svm_parser.add_argument('-st', '--svm_tolerance', type=float, default=1e-3,
                        required=False)
args = parser.parse_args(sys.argv[1:])

# Set up logger
logger = get_logger(args.verbosity)

################################################################################
# LOAD INPUT DATA
################################################################################
# Load features
logger.info('[Loading features and genetic interactions]')
a_data = joblib.load(args.feature_files[0])
b_data = joblib.load(args.feature_files[1])
X_A, y_A = np.array(a_data.get('X')), a_data.get('y')
A_pairs, A_name = np.asarray(a_data.get('pairs')), args.names[0]
X_B, y_B = np.array(b_data.get('X')), b_data.get('y')
B_pairs, B_name = np.asarray(b_data.get('pairs')), args.names[1]
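
# The training code that consumes the 'rf'/'svm' subcommand is not shown in
# this snippet. A hedged sketch of how the parsed arguments might map onto
# scikit-learn estimators and hyper-parameter grids; the estimator choices and
# grid keys are assumptions, not the project's confirmed implementation.
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

def build_classifier_sketch(args):
    # Map the chosen subcommand onto an estimator and a grid of the
    # hyper-parameters exposed by the subparsers above.
    if args.classifier == 'rf':
        return RandomForestClassifier(max_depth=args.max_depth), \
               {'n_estimators': args.n_trees}
    elif args.classifier == 'svm':
        return SVC(tol=args.svm_tolerance), {'C': args.svm_Cs}
    raise ValueError('Unknown classifier: %s' % args.classifier)
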