import os
import sys
import time
from collections import defaultdict

import networkx as nx
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy import sparse
from sklearn.model_selection import KFold
from tqdm import tqdm, trange

# project-local modules referenced by these excerpts (not shown here):
# alg_utils, utils, setup, eval_utils, eval_loso, run_eval_algs, plot_utils,
# logReg, svm, ss_bounds, gs, plus module-level helpers and lookup tables such as
# _net_normalize, combineNetworksSWSN, delete_rows_csr, load_net_ann_datasets,
# get_mat_neighbors, alg_names, evidence_code_name, evidence_code_type,
# annotation_type_styles, edge_type_color, node_type_color


def split_cv_all_goterms(ann_obj, folds=5, seed=None, **kwargs):
    """ Split the positives and negatives into folds across all GO terms.
    *seed*: the seed used by the random number generator when generating the folds.
        If None, the np.random RandomState will be used
    *returns*: a list of (train, test) annotation matrix tuples, one per fold
    """
    ann_matrix = ann_obj.ann_matrix
    goids, prots = ann_obj.goids, ann_obj.prots
    print("Splitting all annotations into %d folds by splitting each GO term's annotations into folds, and then combining them" % (folds))
    # TODO there must be a better way to do this than getting the folds in each GO term separately,
    # but this at least ensures that each GO term has evenly split annotations
    # list of (train, test) annotation matrices, one tuple per fold
    ann_matrix_folds = []
    for i in range(folds):
        train_ann_mat = sparse.lil_matrix(ann_matrix.shape, dtype=float)
        test_ann_mat = sparse.lil_matrix(ann_matrix.shape, dtype=float)
        ann_matrix_folds.append((train_ann_mat, test_ann_mat))

    for i in trange(ann_matrix.shape[0]):
        goid = goids[i]
        positives, negatives = alg_utils.get_goid_pos_neg(ann_matrix, i)
        # if there are fewer positives or negatives than there are folds, KFold will give an error
        if len(positives) < folds or len(negatives) < folds:
            continue
        #print("%d positives, %d negatives for goterm %s" % (len(positives), len(negatives), goid))
        # split the set of positives and the set of negatives into K folds separately
        kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
        kf.get_n_splits(positives)
        kf.get_n_splits(negatives)
        fold = 0

        # now combine the positive and negative sets into a single array,
        # and store it in the corresponding training or testing matrix
        for (pos_train_idx, pos_test_idx), (neg_train_idx, neg_test_idx) in zip(kf.split(positives), kf.split(negatives)):
            train_pos, test_pos = positives[pos_train_idx], positives[pos_test_idx]
            train_neg, test_neg = negatives[neg_train_idx], negatives[neg_test_idx]
            train_ann_mat, test_ann_mat = ann_matrix_folds[fold]
            fold += 1
            # build an array of positive and negative assignments and set it in the corresponding annotation matrix
            for pos, neg, mat in [(train_pos, train_neg, train_ann_mat), (test_pos, test_neg, test_ann_mat)]:
                pos_neg_arr = np.zeros(len(prots))
                pos_neg_arr[list(pos)] = 1
                pos_neg_arr[list(neg)] = -1
                mat[i] = pos_neg_arr
    return ann_matrix_folds
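# To see the fold-splitting trick in isolation: splitting the positives and the
# negatives with two separate KFold generators (zipped together) keeps each
# fold's pos/neg ratio roughly even. A minimal runnable sketch with toy index
# arrays, not the real annotation data:
#
#   import numpy as np
#   from sklearn.model_selection import KFold
#
#   positives = np.arange(10)        # toy indices of positive prots
#   negatives = np.arange(10, 30)    # toy indices of negative prots
#   kf = KFold(n_splits=5, shuffle=True, random_state=0)
#   for (pos_tr, pos_te), (neg_tr, neg_te) in zip(kf.split(positives), kf.split(negatives)):
#       print("train: %d pos / %d neg, test: %d pos / %d neg"
#             % (len(pos_tr), len(neg_tr), len(pos_te), len(neg_te)))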
def run_cv_all_goterms(alg_runners, ann_obj, folds=5, num_reps=1, cv_seed=None, **kwargs):
    """ Split the positives and negatives into folds across all GO terms,
    and then run the algorithms on those folds.
    Algorithms are all run on the same split of data.
    *num_reps*: number of times to repeat cross-validation.
        An output file will be written for each repeat
    *cv_seed*: seed to use for the random number generator when splitting the annotations into folds.
        If *num_reps* > 1, the seed will be incremented by 1 each time
    """
    ann_matrix = ann_obj.ann_matrix
    goids, prots = ann_obj.goids, ann_obj.prots

    # set the cv_seed if specified
    # 2019-06-26 BUG: If there are a different number of terms, or the order of the terms changed, then the results won't be the same
    #if cv_seed is not None:
    #    print("\nSetting the Random State seed to %d" % (cv_seed))
    #    np.random.seed(cv_seed)

    # first check to see if the algorithms have already been run
    # and if the results should be overwritten
    if kwargs['forcealg'] is True or len(goids) == 1:
        # runners_to_run is a list of runners for each repetition
        runners_to_run = {i: alg_runners for i in range(1, num_reps+1)}
    else:
        runners_to_run = {}
        # a different file is stored for each repetition, so check each one
        for rep in range(1, num_reps+1):
            curr_runners_to_run = []
            curr_seed = cv_seed
            if curr_seed is not None:
                # add the current repetition number to the seed
                curr_seed += rep - 1
            for run_obj in alg_runners:
                out_file = "%s/cv-%dfolds-rep%d%s%s.txt" % (
                    run_obj.out_dir, folds, rep,
                    "-seed%s" % curr_seed if curr_seed is not None else "",
                    run_obj.params_str)
                if os.path.isfile(out_file):
                    print("%s already exists. Use --forcealg to overwrite" % (out_file))
                else:
                    curr_runners_to_run.append(run_obj)
            runners_to_run[rep] = curr_runners_to_run

    # repeat the CV process the specified number of times
    for rep in range(1, num_reps+1):
        if len(runners_to_run[rep]) == 0:
            continue
        curr_seed = cv_seed
        if curr_seed is not None:
            # add the current repetition number to the seed
            curr_seed += rep - 1
        # split the annotation matrix into training and testing matrices K times
        ann_matrix_folds = split_cv_all_goterms(ann_obj, folds=folds, seed=curr_seed, **kwargs)

        for run_obj in runners_to_run[rep]:
            # because each fold contains a different set of positives, and combined they contain all positives,
            # store all of the prediction scores from each fold in a matrix
            combined_fold_scores = sparse.lil_matrix(ann_matrix.shape, dtype=float)
            for curr_fold, (train_ann_mat, test_ann_mat) in enumerate(ann_matrix_folds):
                print("* "*20)
                print("Fold %d" % (curr_fold+1))

                # change the annotation matrix to the current fold
                curr_ann_obj = setup.Sparse_Annotations(train_ann_mat, goids, prots)
                # replace the ann_obj in the runner with the current fold's annotations
                run_obj.ann_obj = curr_ann_obj
                run_obj.train_mat = train_ann_mat
                run_obj.test_mat = test_ann_mat
                #alg_runners = run_eval_algs.setup_runners([alg], alg_settings, net_obj, curr_ann_obj, **kwargs)
                # now setup the inputs for the runners
                run_obj.setupInputs()
                # run the alg
                run_obj.run()
                # parse the outputs. Only needed for the algs that write output files
                run_obj.setupOutputs()

                # store only the scores of the test (left-out) positives and negatives
                for i in range(len(goids)):
                    test_pos, test_neg = alg_utils.get_goid_pos_neg(test_ann_mat, i)
                    curr_goid_scores = run_obj.goid_scores[i].toarray().flatten()
                    curr_comb_scores = combined_fold_scores[i].toarray().flatten()
                    curr_comb_scores[test_pos] = curr_goid_scores[test_pos]
                    curr_comb_scores[test_neg] = curr_goid_scores[test_neg]
                    combined_fold_scores[i] = curr_comb_scores

            # replace the goid_scores in the runner with combined_fold_scores to evaluate
            run_obj.goid_scores = combined_fold_scores

            #curr_goids = dag_goids if alg == 'birgrank' else goids
            # now evaluate the results and write to a file
            out_file = "%s/cv-%dfolds-rep%d%s%s.txt" % (
                run_obj.out_dir, folds, rep,
                "-seed%s" % curr_seed if curr_seed is not None else "",
                run_obj.params_str)
            utils.checkDir(os.path.dirname(out_file))
            eval_utils.evaluate_ground_truth(
                run_obj, ann_obj, out_file,
                #non_pos_as_neg_eval=opts.non_pos_as_neg_eval,
                alg=run_obj.name, append=False, **kwargs)

    print("Finished running cross-validation")
    return
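# A quick sanity check of the per-repetition seeding scheme used above: each
# repetition offsets cv_seed by (rep - 1), so every repeat gets a distinct but
# reproducible split and a distinct output file name. A sketch with toy values
# (the "outputs/" prefix is a placeholder, not a real runner attribute):
#
#   def example_cv_out_files(cv_seed=5, num_reps=3, folds=5):
#       for rep in range(1, num_reps + 1):
#           curr_seed = cv_seed + rep - 1 if cv_seed is not None else None
#           seed_str = "-seed%s" % curr_seed if curr_seed is not None else ""
#           print("outputs/cv-%dfolds-rep%d%s.txt" % (folds, rep, seed_str))
#
#   example_cv_out_files()
#   # outputs/cv-5folds-rep1-seed5.txt
#   # outputs/cv-5folds-rep2-seed6.txt
#   # outputs/cv-5folds-rep3-seed7.txt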
def weight_SWSN(ann_matrix, sparse_nets=None, normalized_nets=None, net_names=None,
                out_file=None, nodes=None, verbose=False):
    """
    *sparse_nets*: list of networks stored as scipy sparse matrices. They will be normalized first
    *normalized_nets*: list of networks stored as scipy sparse matrices. Should already be normalized
    *returns*: the combined network, the time taken, and the list of weights per network
    """
    # UPDATED: normalize the networks
    if sparse_nets is not None:
        print("Normalizing the networks")
        normalized_nets = []
        for net in sparse_nets:
            normalized_nets.append(_net_normalize(net))
    elif normalized_nets is None:
        print("No networks given. Nothing to do")
        return None, 0, None
    if len(normalized_nets) == 1:
        print("Only one network given to weight_SWSN. Nothing to do.")
        total_time = 0
        net = sparse_nets[0] if sparse_nets is not None else normalized_nets[0]
        # the single network gets the full weight
        return net, total_time, [1]

    if verbose:
        print("Removing rows with 0 annotations/positives")
        utils.print_memory_usage()
    # remove rows with 0 annotations/positives
    empty_rows = []
    for i in range(ann_matrix.shape[0]):
        pos, neg = alg_utils.get_goid_pos_neg(ann_matrix, i)
        # the combineWeightsSWSN method doesn't seem to
        # work if there's only 1 positive
        if len(pos) <= 1 or len(neg) <= 1:
            empty_rows.append(i)
    # don't modify the original annotation matrix to keep the rows matching the GO ids
    curr_ann_mat = delete_rows_csr(ann_matrix.tocsr(), empty_rows)

    if verbose:
        utils.print_memory_usage()
    print("Weighting networks for %d different GO terms" % (curr_ann_mat.shape[0]))
    print("Running simultaneous weights with specific negatives")
    start_time = time.process_time()
    alpha, indices = combineNetworksSWSN(curr_ann_mat, normalized_nets, verbose=verbose)
    # print out the computed weights for each network
    if net_names is not None:
        print("network weights:")
        #print("\tnetworks chosen: %s" % (', '.join([net_names[i] for i in indices])))
        weights = defaultdict(int)
        for i in range(len(alpha)):
            weights[net_names[indices[i]]] = alpha[i]
        weights_table = ["%0.3e" % weights[net] for net in net_names]
        print('\t'.join(net_names))
        print('\t'.join(weights_table))

    # now add the networks together with the alpha weight applied
    weights_list = [0] * len(normalized_nets)
    weights_list[indices[0]] = alpha[0]
    combined_network = alpha[0] * normalized_nets[indices[0]]
    for i in range(1, len(alpha)):
        combined_network += alpha[i] * normalized_nets[indices[i]]
        weights_list[indices[i]] = alpha[i]
    total_time = time.process_time() - start_time

    if out_file is not None:
        # replace the .txt if present
        out_file = out_file.replace('.txt', '.npz')
        utils.checkDir(os.path.dirname(out_file))
        print("\twriting combined network to %s" % (out_file))
        sp.save_npz(out_file, combined_network)
        # also write the node ids so it's easier to access
        # TODO figure out a better way to store this
        node2idx_file = out_file + "-node-ids.txt"
        print("\twriting node ids to %s" % (node2idx_file))
        with open(node2idx_file, 'w') as out:
            out.write(''.join("%s\t%s\n" % (n, i) for i, n in enumerate(nodes)))
        # write the alpha/weight of the networks as well
        net_weight_file = out_file + "-net-weights.txt"
        print("\twriting network weights to %s" % (net_weight_file))
        with open(net_weight_file, 'w') as out:
            out.write(''.join("%s\t%s\n" % (net_names[idx], str(alpha[i])) for i, idx in enumerate(indices)))

    return combined_network, total_time, weights_list
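# The weighted combination above reduces to a sparse linear combination. A
# minimal runnable sketch with toy matrices, assuming alpha/indices come back
# from combineNetworksSWSN as parallel arrays (weight i applies to network
# indices[i]); the toy values here are made up:
#
#   import numpy as np
#   from scipy import sparse
#
#   nets = [sparse.random(5, 5, density=0.4, random_state=i).tocsr() for i in range(3)]
#   alpha, indices = np.array([0.7, 0.3]), np.array([2, 0])  # toy SWSN output
#
#   combined = alpha[0] * nets[indices[0]]
#   for i in range(1, len(alpha)):
#       combined += alpha[i] * nets[indices[i]]
#   print(combined.nnz)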
def run(run_obj):
    """
    Perform logistic regression by building a classifier for each term in the ontology
    """
    params_results = run_obj.params_results
    P, alg, params = run_obj.P, run_obj.name, run_obj.params
    # get the labels matrix and transpose it to have label names as columns
    ann_mat = run_obj.ann_matrix
    max_iter = params['num_iter']

    # see if train and test annotation matrices from the cross validation pipeline exist;
    # if not, set train and test to the original annotation matrix itself
    if run_obj.train_mat is not None and run_obj.test_mat is not None:
        print("Performing cross validation")
        run_obj.cv = True
        train_mat = run_obj.train_mat
        test_mat = run_obj.test_mat
    else:
        run_obj.cv = False
        train_mat = ann_mat
        test_mat = ann_mat

    # stores the scores for all the terms
    scores = sparse.lil_matrix(ann_mat.shape, dtype=float)  # dim: term x genes

    for term in tqdm(run_obj.goids_to_run):
        idx = run_obj.hpoidx[term]
        # compute the train gene indices of the annotations for the given label
        train_pos, train_neg = alg_utils.get_goid_pos_neg(train_mat, idx)
        train_set = sorted(list(set(train_pos) | set(train_neg)))

        if len(train_pos) == 0:
            print("Skipping term, 0 positive examples")
            continue

        if run_obj.cv:
            # if cross validation, then obtain the test gene set on which the classifier should be tested
            test_pos, test_neg = alg_utils.get_goid_pos_neg(test_mat, idx)
            test_set = set(test_pos) | set(test_neg)
            test_set = sorted(list(test_set))
        else:
            # set all unlabeled genes to the test set
            test_set = sorted(list(set(run_obj.protidx.values()) - set(train_set)))

        # obtain the feature vector only for the genes in the training set
        X_train = P[train_set, :]
        # obtain the feature vector only for the genes in the testing set
        X_test = P[test_set, :]
        # obtain the labels matrix corresponding to genes in the training set
        y_train = train_mat.transpose()[train_set, :]
        y_train = sparse.lil_matrix(y_train)
        # get the column of training data for the given label
        lab = y_train[:, idx].toarray().flatten()

        # now train the model on the constructed training data and the column of labels
        clf = logReg.training(X_train, lab, max_iter)

        # make predictions on the constructed test set
        predict = logReg.testing(clf, X_test)
        predict = predict.tolist()

        # get the current scores for the given label term in the current fold
        curr_score = scores[idx].toarray().flatten()
        # for the test indices of the current label, set the scores
        curr_score[test_set] = predict
        curr_score[train_pos] = 1
        # add the scores produced by predicting on the current label of the test set to a combined score matrix
        scores[idx] = curr_score

    run_obj.goid_scores = scores
    run_obj.params_results = params_results
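# logReg.training/testing are project-local wrappers not shown here; a minimal
# stand-in built on scikit-learn (an assumption about what they wrap) that
# matches how run() uses them: fit on a +1/-1 label vector, return one score
# per gene in X_test.
#
#   from sklearn.linear_model import LogisticRegression
#
#   def training(X_train, labels, max_iter):
#       clf = LogisticRegression(max_iter=max_iter)
#       clf.fit(X_train, labels)
#       return clf
#
#   def testing(clf, X_test):
#       # probability of the positive (+1) class, one score per gene
#       return clf.predict_proba(X_test)[:, 1]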
def leave_out_taxon(t, ann_obj, species_to_uniprot_idx,
                    eval_ann_obj=None, keep_ann=False,
                    non_pos_as_neg_eval=False, eval_goterms_with_left_out_only=False,
                    oracle=False, num_test_cutoff=10, **kwargs):
    """ Training positives are removed from the testing positives,
    and training positives and negatives are removed from the testing negatives.
    Training negatives are not removed from the testing positives,
    because not all algorithms use negatives.
    *t*: species to be left out. If t is None or 'all', then no species will be left out,
        and keep_ann must be True.
    *eval_ann_obj*: if given, these annotations will be used to set up the
        test/evaluation positives and negatives instead of those in *ann_obj*
    *eval_goterms_with_left_out_only*: if eval_ann_obj is given and keep_ann is False,
        only evaluate GO terms that have at least 2% of annotations.
        Useful to speed up processing for term-based algorithms
    *oracle*: remove train negatives that are actually test positives
    *num_test_cutoff*: minimum number of annotations for each GO term in the left-out species
    """
    if t == "all":
        t = None
    # leave this taxon out by removing its annotations;
    # rather than a dictionary, build a matrix
    ann_matrix, goids, prots = ann_obj.ann_matrix, ann_obj.goids, ann_obj.prots
    train_ann_mat = sparse.lil_matrix(ann_matrix.shape, dtype=float)
    test_ann_mat = sparse.lil_matrix(ann_matrix.shape, dtype=float)
    sp_goterms = []
    #skipped_eval_no_left_out_ann = 0
    for idx, goid in enumerate(goids):
        pos, neg = alg_utils.get_goid_pos_neg(ann_matrix, idx)
        ann_pos = set(list(pos))
        ann_neg = set(list(neg))
        # first setup the training annotations (those used as positives/negatives for the algorithm)
        if keep_ann:
            train_pos = ann_pos
            train_neg = ann_neg
        else:
            train_pos = ann_pos - species_to_uniprot_idx[t]
            train_neg = ann_neg - species_to_uniprot_idx[t]
        eval_pos = ann_pos.copy()
        eval_neg = ann_neg.copy()
        # setup the testing annotations (those used when evaluating the performance)
        if eval_ann_obj is not None:
            if goid not in eval_ann_obj.goid2idx:
                eval_pos, eval_neg = set(), set()
            else:
                eval_pos, eval_neg = alg_utils.get_goid_pos_neg(
                    eval_ann_obj.ann_matrix, eval_ann_obj.goid2idx[goid])
                eval_pos = set(list(eval_pos))
                eval_neg = set(list(eval_neg))
        # if this species has little-to-no annotations that are being left out, then we can skip it
        #if not keep_ann and eval_goterms_with_left_out_only:
        #    # If the percentage of left-out ann is less than 2%, then skip it
        #    if (len(ann_pos) - len(train_pos)) / float(len(train_pos)) < .02:
        #        skipped_eval_no_left_out_ann += 1
        #        continue
        if t is None:
            test_pos = eval_pos
            test_neg = eval_neg
            if non_pos_as_neg_eval:
                # everything minus the positives
                test_neg = set(prots) - test_pos
        else:
            test_pos = eval_pos & species_to_uniprot_idx[t]
            # UPDATE 2018-06-27: Only evaluate the species prots as negatives, not all prots
            if non_pos_as_neg_eval:
                test_neg = species_to_uniprot_idx[t] - eval_pos
                test_neg.discard(None)
            else:
                test_neg = eval_neg & species_to_uniprot_idx[t]
        # UPDATE 2018-06-30: Remove test positives/negatives that are part of the training positives/negatives.
        # Don't remove a test positive if it's a training negative, because not all algorithms use negatives
        test_pos -= train_pos
        if oracle:
            train_neg -= test_pos
        test_neg -= train_pos | train_neg
        # build an array of the scores and set it in the goid sparse matrix of scores.
        # UPDATE 2019-07: Some algorithms are node-based and could benefit from the extra annotations,
        # so the annotations are stored even for terms that fail the cutoff below
        pos_neg_arr = np.zeros(len(prots))
        pos_neg_arr[list(train_pos)] = 1
        pos_neg_arr[list(train_neg)] = -1
        train_ann_mat[idx] = pos_neg_arr
        pos_neg_arr = np.zeros(len(prots))
        pos_neg_arr[list(test_pos)] = 1
        pos_neg_arr[list(test_neg)] = -1
        test_ann_mat[idx] = pos_neg_arr
        # UPDATE 2018-10: Add a cutoff on both the # of training positives and # of test positives
        if len(train_pos) < num_test_cutoff or len(test_pos) < num_test_cutoff or \
           (len(train_neg) == 0 or len(test_neg) == 0):
            continue
        sp_goterms.append(goid)

    #if eval_ann_matrix is not None and not keep_ann and eval_goterms_with_left_out_only:
    #    print("\t%d goterms skipped_eval_no_left_out_ann (< 0.02 train ann in the left-out species)" % (skipped_eval_no_left_out_ann))
    return train_ann_mat.tocsr(), test_ann_mat.tocsr(), sp_goterms
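# The train/test bookkeeping above is plain set arithmetic. A toy example with
# made-up protein indices for a left-out taxon t:
#
#   ann_pos = {1, 2, 3, 7, 8}
#   ann_neg = {4, 5, 9}
#   taxon_idx = {7, 8, 9}            # prots belonging to the left-out species
#
#   train_pos = ann_pos - taxon_idx                             # {1, 2, 3}
#   train_neg = ann_neg - taxon_idx                             # {4, 5}
#   test_pos = (ann_pos & taxon_idx) - train_pos                # {7, 8}
#   test_neg = (ann_neg & taxon_idx) - (train_pos | train_neg)  # {9}
#   print(train_pos, train_neg, test_pos, test_neg)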
def weight_SWSN(ann_matrix, sparse_nets, net_names=None, out_file=None, nodes=None):
    """ Weight and combine the given networks using the
    Simultaneous Weighting with Specific Negatives (SWSN) method
    """
    if len(sparse_nets) == 1:
        print("Only one network given to weight_SWSN. Nothing to do.")
        total_time = 0
        return sparse_nets[0], total_time
    # remove rows with 0 annotations/positives
    empty_rows = []
    for i in range(ann_matrix.shape[0]):
        pos, neg = alg_utils.get_goid_pos_neg(ann_matrix, i)
        # the combineWeightsSWSN method doesn't seem to
        # work if there's only 1 positive
        if len(pos) <= 1 or len(neg) <= 1:
            empty_rows.append(i)
    # don't modify the original annotation matrix to keep the rows matching the GO ids
    curr_ann_mat = delete_rows_csr(ann_matrix.tocsr(), empty_rows)

    # normalize the networks
    print("Normalizing the networks")
    normalized_nets = []
    for net in sparse_nets:
        normalized_nets.append(_net_normalize(net))
    print("Weighting networks for %d different GO terms" % (curr_ann_mat.shape[0]))
    print("Running simultaneous weights with specific negatives")
    start_time = time.process_time()
    alpha, indices = combineNetworksSWSN(curr_ann_mat, normalized_nets)
    if net_names is not None:
        print("\tnetworks chosen: %s" % (', '.join([net_names[i] for i in indices])))

    # now add the original (unnormalized) networks together with the alpha weight applied
    combined_network = alpha[0] * sparse_nets[indices[0]]
    for i in range(1, len(alpha)):
        combined_network += alpha[i] * sparse_nets[indices[i]]
    total_time = time.process_time() - start_time

    if out_file is not None:
        # replace the .txt if present
        out_file = out_file.replace('.txt', '.npz')
        utils.checkDir(os.path.dirname(out_file))
        print("\twriting combined network to %s" % (out_file))
        sparse.save_npz(out_file, combined_network)
        # also write the node ids so it's easier to access
        # TODO figure out a better way to store this
        node2idx_file = out_file + "-node-ids.txt"
        print("\twriting node ids to %s" % (node2idx_file))
        with open(node2idx_file, 'w') as out:
            out.write(''.join("%s\t%s\n" % (n, i) for i, n in enumerate(nodes)))
        # write the alpha/weight of the networks as well
        net_weight_file = out_file + "-net-weights.txt"
        print("\twriting network weights to %s" % (net_weight_file))
        with open(net_weight_file, 'w') as out:
            out.write(''.join("%s\t%s\n" % (net_names[idx], str(alpha[i])) for i, idx in enumerate(indices)))

    return combined_network, total_time
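# delete_rows_csr is a project-local helper used by both versions of
# weight_SWSN but not shown here. A common way to implement it, assuming its
# behavior is to return a copy of a CSR matrix without the given rows:
#
#   import numpy as np
#
#   def delete_rows_csr(mat, rows_to_delete):
#       keep = np.ones(mat.shape[0], dtype=bool)
#       keep[list(rows_to_delete)] = False
#       return mat[keep]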
def setup_post_to_graphspace(config_map, selected_goid, alg='fastsinksource',
                             name_postfix='', tags=None, taxon=None,
                             goid_summary_file=None, num_neighbors=1,
                             nodes_to_post=None, **kwargs):
    input_settings, alg_settings, output_settings, out_pref, kwargs = \
        plot_utils.setup_variables(config_map, **kwargs)

    input_dir = input_settings['input_dir']
    dataset = input_settings['datasets'][0]
    for arg in ['ssn_target_only', 'ssn_target_ann_only', 'ssn_only',
                'string_target_only', 'string_nontarget_only',
                'limit_to_taxons_file', 'add_target_taxon',
                'oracle_weights', 'rem_neg_neighbors',
                'youngs_neg', 'sp_leaf_terms_only']:
        kwargs[arg] = dataset.get(arg)
    uniprot_taxon_file = "%s/%s" % (input_dir, dataset['taxon_file'])

    # don't need it since we are re-running the alg anyway
    ## predictions file:
    #results_dir = "%s/%s/%s" % (
    #    output_settings['output_dir'], dataset['net_version'], dataset['exp_name'])
    #alg_params = alg_settings[alg]
    #combos = [dict(zip(alg_params.keys(), val))
    #          for val in itertools.product(*(alg_params[param] for param in alg_params))]
    ## TODO allow for multiple
    #if len(combos) > 1:
    #    print("%d combinations for %s. Using the first one" % (len(combos), alg))
    #param_combo = combos[0]
    ## first get the parameter string for this runner
    #params_str = runner.get_runner_params_str(alg, dataset, param_combo)
    #prec_rec_str = "prec-rec%s-%s" % (taxon, selected_goid)
    #exp_type = 'loso'
    #pred_file = "%s/%s/%s%s%s%s.txt" % (results_dir, alg, exp_type, params_str, kwargs.get('postfix', ''), prec_rec_str)
    #if not os.path.isfile(pred_file):
    #    print("\tPredictions file not found: %s. Quitting" % (pred_file))
    #    sys.exit(1)
    #print("\treading %s" % (pred_file))
    #df = pd.read_csv(pred_file, sep='\t')
    #print(df.head())

    out_dir = "outputs/viz/graphspace/%s-%s/" % (
        dataset['net_version'].split('/')[-1], dataset['exp_name'].split('/')[-1])
    os.makedirs(out_dir, exist_ok=True)
    print("storing net and ann files to %s" % (out_dir))

    # TODO allow posting without STRING
    net_obj, new_net_obj, ann_obj, eval_ann_obj, species_to_uniprot_idx = \
        load_net_ann_datasets(
            out_dir, taxon, dataset, input_settings, alg_settings,
            uniprot_taxon_file, **kwargs)
    W = new_net_obj.W
    prots = ann_obj.prots

    # also run the alg to get the full prediction scores
    # TODO get them from a file?
    alg_settings = {alg: alg_settings[alg]}
    alg_settings[alg]['should_run'] = [True]
    kwargs['verbose'] = True
    alg_runners = run_eval_algs.setup_runners(
        alg_settings, new_net_obj, ann_obj, output_settings['output_dir'], **kwargs)
    run_obj = alg_runners[0]
    run_obj.goids_to_run = [selected_goid]

    train_ann_mat, test_ann_mat, sp_goterms = eval_loso.leave_out_taxon(
        taxon, ann_obj, species_to_uniprot_idx,
        eval_ann_obj=eval_ann_obj, **kwargs)
    # now run the loso evaluation for this term, and get the scores back
    eval_loso.run_and_eval_algs(run_obj, ann_obj, train_ann_mat, test_ann_mat, taxon=taxon, **kwargs)
    term_scores = np.ravel(run_obj.goid_scores[ann_obj.goid2idx[selected_goid]].toarray())
    print("top 10 scores for %s, %s:" % (taxon, selected_goid))
    taxon_prots_idx = list(species_to_uniprot_idx[taxon])
    taxon_prots = [prots[i] for i in taxon_prots_idx]
    taxon_term_scores = term_scores[taxon_prots_idx]
    print('\n'.join(["%s\t%0.4e" % (
        ann_obj.prots[taxon_prots_idx[i]], taxon_term_scores[i])
        for i in np.argsort(taxon_term_scores)[::-1][:10]]))

    pos_neg_file = "%s/%s" % (input_dir, dataset['pos_neg_file'])
    #selected_goid = "15643"  # toxic substance binding
    #selected_goid = "9405"   # pathogenesis
    #selected_goid = "98754"  # detoxification
    selected_goname = None
    # build a dictionary of the evidence code(s) for each prot
    uniprot_to_evidencecode = defaultdict(set)
    annotated_prots = set()
    neg_prots = set()

    if goid_summary_file is None:
        goid_summary_file = pos_neg_file.replace("bp-", '').replace("mf-", '')
        if '-list' in pos_neg_file:
            goid_summary_file = goid_summary_file.replace("-list", "-summary-stats")
        elif '.gz' in pos_neg_file:
            goid_summary_file = goid_summary_file.replace(".tsv.gz", "-summary-stats.tsv")
        else:
            goid_summary_file = goid_summary_file.replace(".tsv", "-summary-stats.tsv")
    df_summary = pd.read_csv(goid_summary_file, sep='\t')
    goid_names = dict(zip(df_summary['GO term'], df_summary['GO term name']))
    #goid_num_anno = dict(zip(df_summary['GO term'], df_summary['# positive examples']))
    print("GO name: %s" % (goid_names[selected_goid]))
    selected_goname = goid_names[selected_goid].replace(' ', '-')[0:20]

    # load the GAIN propagation to get the evidence code
    ev_codes_file = dataset.get('ev_codes_file')
    if ev_codes_file is not None:
        for orf, goid, goname, hierarchy, evidencecode, annotation_type in utils.readColumns(ev_codes_file, 1, 2, 3, 4, 5, 6):
            if selected_goid[:3] == "GO:":
                goid = "GO:" + "0" * (7 - len(goid)) + goid
            if goid != selected_goid:
                continue
            selected_goname = goname.replace(' ', '-')[0:20]
            if annotation_type != '1':
                continue
            uniprot_to_evidencecode[orf].add(evidencecode)

    # limit it to the current taxon
    if taxon is not None:
        print("Getting species of each prot from %s" % (uniprot_taxon_file))
        #print("Limiting the prots to those for taxon %s (%s)" % (taxon, selected_species[taxon]))
        print("Limiting the prots to those for taxon %s" % (taxon))
        # for each of the 19 species, leave out their annotations
        # and see how well we can retrieve them
        uniprot_to_species = utils.readDict(uniprot_taxon_file, 1, 2)
        if taxon not in species_to_uniprot_idx:
            print("Error: taxon ID '%s' not found" % (taxon))
            sys.exit()
        # also limit the proteins to those in the network
        print("\t%d prots for taxon %s." % (len(taxon_prots_idx), taxon))

        goid_idx = ann_obj.goid2idx[selected_goid]
        pos, neg = alg_utils.get_goid_pos_neg(train_ann_mat, goid_idx)
        non_taxon_annotated_prots = set([prots[i] for i in pos])
        non_taxon_neg_prots = set([prots[i] for i in neg])
        print("\t%d non-taxon pos, %d non-taxon neg" % (len(non_taxon_annotated_prots), len(non_taxon_neg_prots)))
        pos, neg = alg_utils.get_goid_pos_neg(test_ann_mat, goid_idx)
        annotated_prots = set([prots[i] for i in pos])
        neg_prots = set([prots[i] for i in neg])
        print("\t%d taxon pos, %d taxon neg" % (len(annotated_prots), len(neg_prots)))

    print("\t%d annotated prots for %s (%s)" % (len(annotated_prots), selected_goname, selected_goid))

    #conf_cutoff = 0.2
    conf_cutoff = -1
    predicted_prots = set()
    ranks = {}
    scores = {}
    first_zero_rank = None
    for i, idx in enumerate(np.argsort(taxon_term_scores)[::-1]):
        rank = i + 1
        prot = prots[taxon_prots_idx[idx]]
        predicted_prots.add(prot)
        score = taxon_term_scores[idx]
        scores[prot] = score
        if taxon is not None:
            ranks[prot] = rank
            if score == 0 and first_zero_rank is None:
                first_zero_rank = rank
        else:
            ranks[prot] = rank
    # move the score between 0 and 1 if it's genemania (normally between -1 and 1)
    # as the score is used to set the opacity
    # TODO fix genemania
    #if alg == "genemania":
    #    pred_cut_conf[gene] = local_conf
    #    local_conf = ((float(local_conf) - -1) / float(1 - -1)) * (1 - 0) + 0
    #pred_local_conf[gene] = local_conf
    print("\t%d prots with a score" % (len(taxon_term_scores)))
    print("Rank of first zero score: %d" % (first_zero_rank))
    print("Ranks of left-out positives:")
    for gene in sorted(annotated_prots, key=ranks.get):
        print("%s\t%d" % (gene, ranks[gene]))
    print("Including top 30 ranked proteins of the left-out species")
    top_30 = sorted(set(taxon_prots) & set(ranks.keys()), key=ranks.get)[:30]
    if ev_codes_file is not None:
        print("Evidence codes of top 30:")
        for i, gene in enumerate(top_30):
            if gene in uniprot_to_evidencecode:
                print("%s\t%s\t%s" % (i, gene, uniprot_to_evidencecode[gene]))
    top_30 = set(top_30)

    if taxon is not None:
        print("Getting the induced subgraph of the neighbors of the %d annotated nodes" % (len(annotated_prots)))
        prededges = set()
        if nodes_to_post is not None:
            print("Getting neighbors of %s" % (', '.join(nodes_to_post)))
            nodes_to_add_neighbors = set(nodes_to_post)
        else:
            nodes_to_add_neighbors = annotated_prots.copy() | top_30
        node2idx = ann_obj.node2idx
        for i in range(num_neighbors):
            #print("Adding neighbors %d" % (i+1))
            curr_nodes_to_add_neighbors = nodes_to_add_neighbors.copy()
            nodes_to_add_neighbors = set()
            print("adding %sneighbors of %d nodes" % ("positive ", len(curr_nodes_to_add_neighbors)))
            for u in curr_nodes_to_add_neighbors:
                #neighbors = set(nx.all_neighbors(G, u))
                neighbors = set([prots[v] for v in get_mat_neighbors(W, node2idx[u])])
                if nodes_to_post is None:
                    # UPDATE 2018-10: try adding just the positive neighbors of the node
                    # TODO make this a command-line option
                    neighbors = neighbors & (non_taxon_annotated_prots | annotated_prots | top_30)
                #if len(neighbors) > 15 and nodes_to_post is None:
                #    print("\tskipping adding neighbors of %s. len(neighbors): %d" % (u, len(neighbors)))
                #    continue
                nodes_to_add_neighbors.update(neighbors)
                prededges.update(set([(u, v) for v in neighbors]))
    else:
        print("Getting the induced subgraph of the %d annotated and %d predicted proteins" % (len(annotated_prots), len(predicted_prots)))
        print("not yet implemented. quitting")
        sys.exit()
        #prededges = set(G.subgraph(annotated_prots.union(predicted_prots)).edges())
    prededges = set([tuple(sorted((u, v))) for u, v in prededges])
    # TODO I should also show the disconnected nodes
    prednodes = set([n for edge in prededges for n in edge])
    print("\t%d nodes, %d edges" % (len(prednodes), len(prededges)))
    if len(prededges) > 1000 or len(prednodes) > 500:
        print("\nToo many nodes/edges. Not posting to GraphSpace. Quitting")
        sys.exit()

    #graph_attr_file = ""
    #graph_attr, attr_desc = readGraphAttr()
    # add the edge weight from the network to attr_desc which will be used for the popup
    # set the edges as the neighbors of the annotated genes
    #prededges = set()
    ## get the induced subgraph of the annotated nodes and predicted nodes
    #for n in func_prots:
    #    if not G.has_node(n):
    #        continue
    #    for neighbor in G.neighbors(n):
    #        prededges.add((n, neighbor))

    graph_attr = {n: {} for n in prednodes}
    attr_desc = {n: {} for n in prednodes}

    print("Reading gene names and species for each protein from %s" % (uniprot_taxon_file))
    #prot_species = utils.readDict(uniprot_taxon_file, 1, 2)
    uniprot_to_gene = utils.readDict(uniprot_taxon_file, 1, 4)
    # there can be multiple gene names. Just show the first one for now
    uniprot_to_gene = {n: gene.split(' ')[0] for n, gene in uniprot_to_gene.items()}
    node_labels = {}

    print("building graphspace object")
    # get the abbreviation of the species names
    species_names, net_taxons = eval_loso.get_selected_species(
        species_to_uniprot_idx, kwargs['limit_to_taxons_file'])
    sp_abbrv = {t: ''.join(subs[0] for subs in sp_name.split(' ')[:2])
                for t, sp_name in species_names.items()}
    # for each node, add the prediction values
    for n in tqdm(prednodes):
        # set the name of the node to be the gene name and add the rank to the label
        gene_name = uniprot_to_gene.get(n, n)
        curr_taxon = uniprot_to_species[n]
        species_short_name = sp_abbrv[curr_taxon]
        # add the species to the front of the gene name
        label = "%s-%s" % (species_short_name, gene_name)
        uniprot_to_gene[n] = label
        #node_labels[n] = "%s\n%d" % (label, min(ranks[n], 43)) if n in annotated_prots else label
        node_labels[n] = "%s\n%d" % (
            label, ranks[n] if ranks[n] < first_zero_rank else first_zero_rank) \
            if n in taxon_prots else label
        # maybe put the labels below the nodes?
        # helps with visualizing the background opacity
        graph_attr[n]['text-valign'] = 'bottom'
        # add the strain name to the popup
        attr_desc[n]['Strain'] = species_names[curr_taxon]
        if n in predicted_prots:
            # don't need to normalize because the confidence values are already between 0 and 1
            if taxon and (n in non_taxon_annotated_prots or n in non_taxon_neg_prots):
                pass
            else:
                # UPDATE: use the node rank instead of the node score
                #graph_attr[n]['background-opacity'] = pred_local_conf[n]
                if n not in ranks:
                    graph_attr[n]['background-opacity'] = scores[n]
                else:
                    #graph_attr[n]['background-opacity'] = scores[n]
                    graph_attr[n]['background-opacity'] = max([
                        0.9 - (ranks[n] / float(first_zero_rank)), float(scores[n])])
                    attr_desc[n]["%s rank" % (alg_names[alg])] = ranks[n]
                    attr_desc[n]["%s prediction score" % (alg_names[alg])] = "%0.4f" % (scores[n])
        #elif n in annotated_prots or (taxon and (n in non_taxon_annotated_prots or n in non_taxon_neg_prots)) \
        #        or n in neg_prots:
        #    if n in pred_local_conf:
        #        graph_attr[n]['background-opacity'] = pred_local_conf[n]
        #        attr_desc[n]["Local prediction confidence"] = pred_local_conf[n]
        # also add the annotation to the popup
        if n in uniprot_to_evidencecode:
            codes = uniprot_to_evidencecode[n]
            # TODO add bullet points to the list
            #attr_desc[n]["Evidence code"] = ''.join(["%s (%s)\n" % (c, evidence_code_name[c]) for c in codes])
            # order it by exp, comp, then elec
            evidence_codes = ''.join(["<li>%s (%s)</li>" % (c, evidence_code_name[c])
                                      for c in codes if evidence_code_type[c] == 'experimental'])
            evidence_codes += ''.join(["<li>%s (%s)</li>" % (c, evidence_code_name[c])
                                       for c in codes if evidence_code_type[c] == 'computational'])
            evidence_codes += ''.join(["<li>%s (%s)</li>" % (c, evidence_code_name[c])
                                       for c in codes if evidence_code_type[c] == 'electronic'])
            attr_desc[n]["Evidence code"] = "<ul>%s</ul>" % (evidence_codes)

    # set the width of the edges by the network weight
    edge_weights = defaultdict(float)
    for u, v in tqdm(prededges):
        e = (u, v)
        if e not in attr_desc:
            attr_desc[e] = {}
        if e not in graph_attr:
            graph_attr[e] = {}
        #attr_desc[e]["edge weight"] = G.adj[u][v]['weight']
        if net_obj.multi_net:
            #attr_desc[e]["Final edge weight"] = "%0.1f" % (W[node2idx[u]][:, node2idx[v]].A.flatten()[0])
            edge_type_weights = []
            # add the weights for the individual string networks
            for i in range(len(net_obj.net_names)):
                net_name = net_obj.net_names[i]
                net_name = "SSN (E-value <= 0.1)" if 'eval-e0_1' in net_name else net_name
                net = net_obj.sparse_networks[i]
                w = net[node2idx[u]][:, node2idx[v]].A.flatten()[0]
                if w != 0:
                    #attr_desc[e][net_name] = "%0.1f" % (w)
                    edge_type_weights.append("<li>%s: %0.1f</li>" % (net_name, w))
                    edge_weights[e] += w * net_obj.swsn_weights[i]
            attr_desc[e]["Edge weights by type"] = "<ul>%s</ul>" % (''.join(sorted(edge_type_weights)))
        else:
            attr_desc[e]["Edge weight"] = "%0.1f" % (W[node2idx[u]][:, node2idx[v]].A.flatten()[0])
        # make the edges somewhat opaque for a better visual style
        graph_attr[e]['opacity'] = 0.7

    # set the width of the edges by the network weight
    #edge_weights = {(u, v): float(W[node2idx[u]][:, node2idx[v]].A.flatten()[0]) for u, v in prededges}
    for e, w in edge_weights.items():
        attr_desc[e]["Final edge weight"] = "%0.1f" % (w)
    # TODO set the min and max as parameters or something
    #max_weight = 180
    if net_obj.multi_net:
        max_weight = net_obj.swsn_weights[0] * 180
        print(max_weight)
    else:
        max_weight = 180
    for e in edge_weights:
        if edge_weights[e] > max_weight:
            edge_weights[e] = max_weight
    graph_attr = gs.set_edge_width(prededges, edge_weights, graph_attr,
                                   a=1, b=12, min_weight=1, max_weight=max_weight)

    H = nx.Graph()
    H.add_edges_from(prededges)

    # see which DB the edge came from to set the edge color
    print("Getting the edge type from networks")
    if net_obj.multi_net:
        print("\tFrom both STRING and SEQ_SIM")
        seq_sim_edges = set()
        for u, v in prededges:
            # get the SSN weight of this edge. Should be the first network
            net = net_obj.sparse_networks[0]
            w = net[node2idx[u]][:, node2idx[v]].A.flatten()[0]
            if w != 0:
                # these are all undirected, so just store the sorted version
                u, v = tuple(sorted((u, v)))
                # give these the default color
                graph_attr[(u, v)]['color'] = edge_type_color['default']
                seq_sim_edges.add((u, v))
        #string_edges = set()
        #temp_version = '2017_10-string'
        #net = f_settings.NETWORK_template % (temp_version, temp_version)
        #for u, v in utils.readColumns(net, 1, 2):
        #    #if (u, v) not in prededges:
        #    if not H.has_edge(u, v):
        #        continue
        #    # give these the default color
        #    u, v = tuple(sorted((u, v)))
        #    graph_attr[(u, v)]['color'] = edge_type_color['string']
        #    string_edges.add((u, v))
        string_edges = prededges.difference(seq_sim_edges)
        print("\t%d edges from seq-sim, %d edges from STRING" % (len(seq_sim_edges), len(string_edges)))
        # set the color to STRING if it didn't come from sequence similarity
        for e in string_edges:
            #if 'color' not in graph_attr[e]:
            graph_attr[e]['color'] = edge_type_color['string']
    #elif 'STRING' in f_settings.NETWORK_VERSION_INPUTS[version]:
    #    for e in graph_attr:
    #        graph_attr[e]['color'] = edge_type_color['string']
    else:
        for e in graph_attr:
            graph_attr[e]['color'] = edge_type_color['default']

    # apply the evidence code style to each protein
    for n in prednodes:
        if n in annotated_prots:
            graph_attr[n]['color'] = node_type_color['annotation']
        elif taxon and n in non_taxon_annotated_prots:
            graph_attr[n]['color'] = node_type_color['non-taxon-annotation']
        elif taxon and n in non_taxon_neg_prots:
            graph_attr[n]['color'] = node_type_color['non-taxon-neg-annotation']
        elif n in neg_prots:
            graph_attr[n]['color'] = node_type_color['neg-annotation']
        elif n in predicted_prots:
            graph_attr[n]['color'] = node_type_color['prediction']
        if n in uniprot_to_evidencecode:
            curr_style = ""
            for evidencecode in uniprot_to_evidencecode[n]:
                curr_type = evidence_code_type[evidencecode]
                if curr_type == "experimental":
                    curr_style = annotation_type_styles[curr_type]
                    break
                elif curr_style == "computational":
                    continue
                else:
                    curr_style = annotation_type_styles[curr_type]
            graph_attr[n].update(curr_style)
        # temporary fix to get the non-target positive examples
        if n in non_taxon_annotated_prots:
            graph_attr[n].update(annotation_type_styles['experimental'])

    # TODO build the popups here. That way the popup building logic
    # can be separated from the GSGraph building logic
    popups = {}
    prednodes = set([n for edge in prededges for n in edge])
    for n in prednodes:
        popups[n] = gs.buildNodePopup(n, attr_val=attr_desc)
    for u, v in prededges:
        popups[(u, v)] = gs.buildEdgePopup(u, v, node_labels=uniprot_to_gene, attr_val=attr_desc)

    # Now post to GraphSpace!
    print("Building GraphSpace graph")
    G = gs.constructGraph(prededges, node_labels=node_labels, graph_attr=graph_attr, popups=popups)

    # TODO add an option to build the 'graph information' tab legend/info
    # build the 'Graph Information' metadata
    #desc = gs.buildGraphDescription(opts.edges, opts.net)
    desc = ''
    metadata = {'description': desc, 'tags': [], 'title': ''}
    if tags is not None:
        metadata['tags'] = tags
    G.set_data(metadata)
    if 'graph_exp_name' in dataset:
        graph_exp_name = dataset['graph_exp_name']
    else:
        graph_exp_name = "%s-%s" % (dataset['exp_name'].split('/')[-1], dataset['net_version'].split('/')[-1])
    graph_name = "%s-%s-%s-%s%s" % (selected_goname, selected_goid, alg, graph_exp_name, name_postfix)
    G.set_name(graph_name)

    # rather than call it from here and repeat all the options, return G, and then call this after
    #post_graph_to_graphspace(G, opts.username, opts.password, opts.graph_name,
    #                         apply_layout=opts.apply_layout, layout_name=opts.layout_name,
    #                         group=opts.group, make_public=opts.make_public)
    return G, graph_name
def run(run_obj):
    """
    Build an SVM classifier for each term in the ontology
    """
    params_results = run_obj.params_results
    P, alg, params = run_obj.P, run_obj.name, run_obj.params
    # get the labels matrix and transpose it to have label names as columns
    ann_mat = run_obj.ann_matrix
    max_iter = params['num_iter']

    if run_obj.train_mat is not None and run_obj.test_mat is not None:
        print("Performing cross validation")
        run_obj.cv = True
        train_mat = run_obj.train_mat
        test_mat = run_obj.test_mat
    else:
        run_obj.cv = False
        train_mat = ann_mat
        test_mat = ann_mat

    # stores the scores for all the terms
    scores = sparse.lil_matrix(ann_mat.shape, dtype=float)  # dim: term x genes

    for term in tqdm(run_obj.goids_to_run):
        idx = run_obj.hpoidx[term]
        # get the training positive, negative sets for the current fold
        train_pos, train_neg = alg_utils.get_goid_pos_neg(train_mat, idx)
        train_set = sorted(list(set(train_pos) | set(train_neg)))

        if len(train_pos) == 0:
            print("Skipping term, 0 positive examples")
            continue

        if run_obj.cv:
            # if cross validation, then obtain the test gene set on which the classifier should be tested
            test_pos, test_neg = alg_utils.get_goid_pos_neg(test_mat, idx)
            test_set = set(test_pos) | set(test_neg)
            test_set = sorted(list(test_set))
        else:
            # set all unlabeled genes to the test set
            test_set = sorted(list(set(run_obj.protidx.values()) - set(train_set)))

        # obtain the feature vector only for the genes in the training set
        X_train = P[train_set, :]
        # obtain the feature vector only for the genes in the testing set
        X_test = P[test_set, :]
        # obtain the labels matrix corresponding to genes in the training set
        y_train = train_mat.transpose()[train_set, :]
        y_train = sparse.lil_matrix(y_train)

        classifier = svm.training(X_train, y_train[:, idx].toarray().flatten(), max_iter)
        score_testSet = svm.testing(classifier, X_test)
        predict = score_testSet.tolist()

        # get the current scores for the given term in the current fold
        curr_score = scores[idx].toarray().flatten()
        # for the test indices of the current label, set the scores
        curr_score[test_set] = predict
        curr_score[train_pos] = 1
        # add the scores produced by predicting on the current label of the test set to a combined score matrix
        scores[idx] = curr_score

    run_obj.goid_scores = scores
    run_obj.params_results = params_results
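# svm.training/testing are project-local wrappers, like logReg above; a minimal
# scikit-learn stand-in (an assumption about what they wrap) that returns
# signed margin distances as ranking scores:
#
#   from sklearn.svm import LinearSVC
#
#   def training(X_train, labels, max_iter):
#       clf = LinearSVC(max_iter=max_iter)
#       clf.fit(X_train, labels)
#       return clf
#
#   def testing(clf, X_test):
#       # signed distance to the separating hyperplane; higher means more positive
#       return clf.decision_function(X_test)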
def run(run_obj): """ Function to run FastSinkSource, FastSinkSourcePlus, Local and LocalPlus *goids_to_run*: goids for which to run the method. Must be a subset of the goids present in the ann_obj """ params_results = run_obj.params_results # make sure the goid_scores matrix is reset # because if it isn't empty, overwriting the stored scores seems to be time consuming goid_scores = sp.lil_matrix(run_obj.ann_matrix.shape, dtype=np.float) goid_rank_stats = {} P, alg, params = run_obj.P, run_obj.name, run_obj.params print("Running %s with these parameters: %s" % (alg, params)) # run FastSinkSource on each GO term individually #for i in trange(run_obj.ann_matrix.shape[0]): #goid = run_obj.goids[i] for goid in tqdm(run_obj.goids_to_run): idx = run_obj.ann_obj.goid2idx[goid] # get the row corresponding to the current goids annotations y = run_obj.ann_matrix[idx, :] positives = (y > 0).nonzero()[1] negatives = (y < 0).nonzero()[1] # if this method uses positive examples only, then remove the negative examples if alg in ["fastsinksourceplus", "sinksourceplus", "localplus"]: negatives = None if run_obj.net_obj.weight_gmw is True: start_time = time.process_time() # weight the network for each GO term individually W, process_time = run_obj.net_obj.weight_GMW(y.toarray()[0], goid) P = alg_utils.normalizeGraphEdgeWeights(W, ss_lambda=params.get( 'lambda', None)) params_results['%s_weight_time' % (alg)] += time.process_time() - start_time a, max_iters = params['alpha'], params['max_iters'] compare_ranks = params['compare_ranks'] # rank_all is a T/F option, but 'rank_pos_neg' will be the test/left-out ann matrix # from which we can get the left-out pos/neg for this term rank_all, rank_pos_neg = params['rank_all'], params['rank_pos_neg'] if sp.issparse(rank_pos_neg): pos, neg = alg_utils.get_goid_pos_neg(rank_pos_neg, idx) rank_pos_neg = (set(pos), set(neg)) elif rank_pos_neg is True: print("ERROR: rank_pos_neg must be the test_ann_mat") sys.exit() # now actually run the algorithm ss_obj = ss_bounds.SinkSourceBounds(P, positives, negatives=negatives, max_iters=max_iters, a=a, rank_all=rank_all, rank_pos_neg=rank_pos_neg, verbose=run_obj.kwargs.get( 'verbose', False)) scores_arr = ss_obj.runSinkSourceBounds() process_time, update_time, iters, comp = ss_obj.get_stats() if run_obj.kwargs.get('verbose', False) is True: tqdm.write("\t%s converged after %d iterations " % (alg, iters) + "(%0.4f sec) for %s" % (process_time, goid)) if compare_ranks: # compare how long it takes for the ranks to match the previous run tqdm.write( "\tRepeating the run, but comparing the ranks from the previous run at each iteration" ) # keep only the nodes with a non-zero score scores = {n: s for n, s in enumerate(scores_arr) if s > 0} # ranks is a list containing the ranked order of nodes. 
# The node with the highest score is first, the lowest is last if rank_pos_neg is not None: pos_neg_nodes = rank_pos_neg[0] | rank_pos_neg[1] ranks = [ n for n in sorted(set(scores.keys()) & pos_neg_nodes, key=scores.get, reverse=True) ] else: ranks = [ n for n in sorted(scores, key=scores.get, reverse=True) ] # left off top-k for now #ranks = ranks[:k] if self.rank_topk is True else ranks ss_obj = ss_bounds.SinkSourceBounds(P, positives, negatives=negatives, max_iters=max_iters, a=a, rank_all=rank_all, rank_pos_neg=rank_pos_neg, ranks_to_compare=ranks, verbose=run_obj.kwargs.get( 'verbose', False)) ss_obj.runSinkSourceBounds() rank_stats = [ "%d\t%d\t%0.4e\t%d\t%d\t%0.2e\t%0.2e\t%0.4f\t%0.4f\t%0.4f" % (len(positives), i + 1, ss_obj.kendalltau_list[i], ss_obj.num_unranked_list[i], ss_obj.max_unranked_stretch_list[i], ss_obj.max_d_list[i], ss_obj.UB_list[i], ss_obj.eval_stats_list[i][0], ss_obj.eval_stats_list[i][1], ss_obj.eval_stats_list[i][2]) for i in range(ss_obj.num_iters) ] goid_rank_stats[goid] = rank_stats #rank_fh.write(''.join("%s%s\t%d\t%d\t%0.6f\t%d\t%d\t%0.4e\t%0.4e\t%0.4f\t%0.4f\t%0.4f\t%0.4f\n" % ( # goid, "\t%s"%self.taxon if self.taxon is not None else "", len(positives), i+1, ss_squeeze.kendalltau_list[i], # ss_squeeze.num_unranked_list[i], ss_squeeze.max_unranked_stretch_list[i], ss_squeeze.max_d_list[i], ss_squeeze.UB_list[i], # ss_squeeze.eval_stats_list[i][0], ss_squeeze.eval_stats_list[i][1], ss_squeeze.eval_stats_list[i][2], ss_squeeze.eval_stats_list[i][3]) # for i in range(ss_squeeze.num_iters))) ## if they're different dimensions, then set the others to zeros #if len(scores_arr) < goid_scores.shape[1]: # scores_arr = np.append(scores_arr, [0]*(goid_scores.shape[1] - len(scores_arr))) # limit the scores to the target nodes if len(run_obj.target_prots) != len(scores_arr): mask = np.ones(len(scores_arr), np.bool) mask[run_obj.target_prots] = 0 scores_arr[mask] = 0 goid_scores[idx] = scores_arr # make sure 0s are removed #goid_scores.eliminate_zeros() # also keep track of the time it takes for each of the parameter sets alg_name = "%s%s" % (alg, run_obj.params_str) #params_results["%s_wall_time"%alg_name] += wall_time params_results["%s_process_time" % alg_name] += process_time params_results["%s_update_time" % alg_name] += update_time run_obj.goid_scores = goid_scores.tocsr() run_obj.params_results = params_results run_obj.goid_rank_stats = goid_rank_stats return
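# The compare_ranks branch above tracks how quickly the ranking stabilizes via
# Kendall's tau; the underlying rank comparison can be reproduced with scipy
# (toy score vectors here, not the algorithm's actual output):
#
#   import numpy as np
#   from scipy.stats import kendalltau
#
#   prev_scores = np.array([0.9, 0.5, 0.7, 0.1])
#   curr_scores = np.array([0.8, 0.6, 0.7, 0.2])
#   tau, pval = kendalltau(prev_scores, curr_scores)
#   print("kendall tau: %0.4f" % tau)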