def setup_variables(config_map, out_pref='', **kwargs):
    """ Set up the various args specified in kwargs """
    input_settings = config_map['input_settings']
    #input_dir = input_settings['input_dir']
    alg_settings = config_map['algs']
    output_settings = config_map['output_settings']
    # update the settings specified in this script with those set in the yaml file
    if config_map.get('eval_settings'):
        kwargs.update(config_map['eval_settings'])
    if config_map.get('plot_settings'):
        #config_map['plot_settings'].update(kwargs)
        kwargs.update(config_map['plot_settings'])
    # overwrite whatever is in the plot settings with the specified args
    if kwargs.get('out_pref') and out_pref != '':
        del kwargs['out_pref']
        #kwargs['out_pref'] = out_pref
    elif kwargs.get('out_pref'):
        out_pref = kwargs['out_pref']

    if kwargs.get('term_stats') is not None:
        df_stats_all = pd.DataFrame()
        for f in kwargs['term_stats']:
            df_stats = pd.read_csv(f, sep='\t')
            df_stats_all = pd.concat([df_stats_all, df_stats])
        kwargs['term_stats'] = df_stats_all

    # if no postfix was set in the yaml file or in this script, then set it to empty
    if kwargs.get('postfix') is None:
        kwargs['postfix'] = ''

    if out_pref == "":
        out_pref = "%s/viz/%s/%s/" % (
            output_settings['output_dir'],
            input_settings['datasets'][0]['net_version'],
            input_settings['datasets'][0]['exp_name'])

    if kwargs.get('only_terms_file') is not None:
        only_terms = pd.read_csv(kwargs['only_terms_file'], sep='\t', index_col=None)
        only_terms = only_terms.iloc[:, 0].values
        print("limiting to %d terms from %s" % (len(only_terms), kwargs['only_terms_file']))
        kwargs['only_terms'] = only_terms
        # setup the name to add to the output file
        only_terms_postfix = kwargs['only_terms_name'].lower() + str(len(kwargs['only_terms'])) + '-'
        out_pref += only_terms_postfix

    # TODO only create the output dir if plots will be created
    if out_pref is not None:
        out_pref += kwargs.get('postfix', '')
        utils.checkDir(os.path.dirname(out_pref))

    return input_settings, alg_settings, output_settings, out_pref, kwargs
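# Minimal sketch (illustration only) of calling setup_variables above. The key names mirror
# the lookups in the function, but the directory names, dataset entries, and plot settings
# shown here are assumptions, not values from the real config files. Note that calling it
# will create the viz output directory as a side effect.
def _example_setup_variables():
    config_map = {
        'input_settings': {'datasets': [{'net_version': 'v1', 'exp_name': 'expA'}]},
        'algs': {'sinksource': {'should_run': [True]}},
        'output_settings': {'output_dir': 'outputs'},
        'plot_settings': {'measures': ['fmax']},
    }
    # with no out_pref given, it defaults to outputs/viz/<net_version>/<exp_name>/
    return setup_variables(config_map)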
def main(config_map, **kwargs):
    input_settings = config_map['input_settings']
    #input_dir = input_settings['input_dir']
    alg_settings = config_map['algs']
    output_settings = config_map['output_settings']

    if kwargs.get('term_stats') is not None:
        df_stats_all = pd.DataFrame()
        for f in kwargs['term_stats']:
            df_stats = pd.read_csv(f, sep='\t')
            df_stats_all = pd.concat([df_stats_all, df_stats])
        kwargs['term_stats'] = df_stats_all

    utils.checkDir(os.path.dirname(kwargs['out_pref']))

    # plot prec-rec separately from everything else
    if kwargs['prec_rec']:
        # loop through all specified terms, or use an empty string if no terms were specified
        terms = kwargs['goterm'] if kwargs['goterm'] is not None else ['']
        for term in terms:
            term = '-' + term if term != '' else ''
            prec_rec = 'prec-rec' + term
            #kwargs['prec_rec'] = prec_rec
            df_all = load_all_results(input_settings, alg_settings, output_settings,
                                      prec_rec_str=prec_rec, **kwargs)
            if len(df_all) == 0:
                print("no terms found. Quitting")
                sys.exit()
            title = '-'.join(df_all['plot_exp_name'].unique())
            plot_curves(df_all, title=title, **kwargs)
    else:
        # get the path to the specified files for each alg
        df_all = load_all_results(input_settings, alg_settings, output_settings, **kwargs)
        if len(df_all) == 0:
            print("no terms found. Quitting")
            sys.exit()
        algs = df_all['Algorithm'].unique()
        print("\t%d algorithms, %d plot_exp_name values\n" % (
            len(algs), len(df_all['plot_exp_name'].unique())))
        #print(df_all.head())
        results_overview(df_all, measures=kwargs['measures'])

        # TODO currently only handles one dataset
        title = '-'.join(df_all['plot_exp_name'].unique())
        # now attempt to figure out what labels/titles to put in the plot
        # based on the net version, exp_name, and plot_exp_name
        for measure in kwargs['measures']:
            if kwargs['boxplot']:
                plot_boxplot(df_all, measure=measure, title=title, **kwargs)
            if kwargs['scatter']:
                plot_scatter(df_all, measure=measure, title=title, **kwargs)
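# Illustrative-only sketch of the plot settings this main() expects in kwargs (typically
# merged in from the 'plot_settings' section of the YAML config). Only the key names mirror
# the lookups above; the values are assumptions.
_example_plot_kwargs = {
    'out_pref': 'outputs/viz/expA/',
    'prec_rec': False,      # True: plot precision-recall curves per term instead of summaries
    'goterm': None,         # optional list of GO terms to restrict the prec-rec plots to
    'measures': ['fmax'],   # which evaluation measures to plot
    'boxplot': True,
    'scatter': False,
}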
def save_net(self, out_file):
    print("Writing %s" % (out_file))
    utils.checkDir(os.path.dirname(out_file))
    if out_file.endswith('.npz'):
        # when the net was loaded, the idx file was already written
        # so no need to write it again
        sp.save_npz(out_file, self.W_SWSN)
    else:
        # convert the adjacency matrix to an edgelist
        G = nx.from_scipy_sparse_matrix(self.W_SWSN)
        idx2node = {i: n for i, n in enumerate(self.nodes)}
        # see also convert_node_labels_to_integers
        G = nx.relabel_nodes(G, idx2node, copy=False)
        delimiter = '\t'
        if out_file.endswith('.csv'):
            delimiter = ','
        nx.write_weighted_edgelist(G, out_file, delimiter=delimiter)
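# Small sketch of reading back a network written by save_net above. The file names are
# hypothetical; the .npz round-trips through scipy.sparse and the edgelist through networkx.
def _example_load_saved_net():
    from scipy import sparse as sp_local
    import networkx as nx_local
    W = sp_local.load_npz("combined-swsn.npz")        # sparse adjacency matrix
    G = nx_local.read_weighted_edgelist("combined-swsn.txt", delimiter='\t')
    return W, G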
def main(sparse_net_file, obo_file, pos_neg_file=None, gaf_file=None,
         ignore_ec=["IEA"], alpha=.5, theta=.5, mu=.5, h="bp", out_pref=None):
    W, prots = alg_utils.setup_sparse_network(sparse_net_file)
    # parse the go_dags first as it also sets up the goid_to_category dictionary
    go_dags = go_examples.parse_obo_file_and_build_dags(obo_file)

    dag_matrix, ann_matrix, goids = build_h_ann_matrices(
        prots, go_dags, pos_neg_file=pos_neg_file, gaf_file=gaf_file, h=h)
    # make sure they're type float so matlab will parse them correctly
    sparse_net = W.astype('float')
    ann_matrix = ann_matrix.astype('float')
    dag_matrix = dag_matrix.astype('float')

    if out_pref is not None:
        out_file = "%s%s-annotations-and-go-dag.mat" % (out_pref, h)
        utils.checkDir(os.path.dirname(out_file))
        print("\twriting graph, annotation, and hierarchy matrices to %s" % (out_file))
        # write these to a file to run the matlab BirgRank
        savemat(out_file, {"G": sparse_net, "R": ann_matrix, "H": dag_matrix}, do_compression=True)

        goids_file = "%s%s-goids.txt" % (out_pref, h)
        print("\twriting goids to %s" % (goids_file))
        with open(goids_file, 'w') as out:
            out.write(''.join("%s\n" % (goid) for goid in goids))

    run_birgrank = True
    if run_birgrank is True:
        # pass the alpha/theta/mu arguments through rather than fixed values
        Xh = birgRank(sparse_net, ann_matrix.transpose(), dag_matrix,
                      alpha=alpha, theta=theta, mu=mu, eps=0.0001, max_iters=1000, verbose=True)
        Xh = Xh.T
        print(Xh.shape)

        out_file = "%s%s-pred-scores.txt" % (out_pref, h)
        print("\twriting scores to %s" % (out_file))
        # write the results for a single GO term
        with open(out_file, 'w') as out:
            for i in range(Xh.shape[0]):
                print("writing results for goterm %s" % (goids[i]))
                out.write(''.join("%s\t%s\t%s\n" % (goids[i], prots[j], score)
                                  for j, score in enumerate(Xh[i].toarray().flatten())))
                break
    return
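# Minimal sketch (an assumption, not part of the original pipeline) of loading the G/R/H
# matrices written by main() above, e.g. to inspect them before running the matlab BirgRank.
# The out_pref and h values are made up; the file name pattern mirrors the one used above.
def _example_load_birgrank_mats(out_pref="outputs/birgrank-", h="bp"):
    from scipy.io import loadmat
    mats = loadmat("%s%s-annotations-and-go-dag.mat" % (out_pref, h))
    G, R, H = mats["G"], mats["R"], mats["H"]   # network, annotations, GO hierarchy
    return G, R, H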
def run_cv_all_goterms(
        alg_runners, ann_obj, folds=5, num_reps=1, cv_seed=None, **kwargs):
    """
    Split the positives and negatives into folds across all GO terms
    and then run the algorithms on those folds.
    Algorithms are all run on the same split of data.
    *num_reps*: Number of times to repeat cross-validation.
        An output file will be written for each repeat
    *cv_seed*: Seed to use for the random number generator when splitting the annotations into folds
        If *num_reps* > 1, the seed will be incremented by 1 each time
    """
    ann_matrix = ann_obj.ann_matrix
    goids, prots = ann_obj.goids, ann_obj.prots

    # set the cv_seed if specified
    # 2019-06-26 BUG: If there are a different number of terms, or the order of the terms changed,
    # then the results won't be the same
    #if cv_seed is not None:
    #    print("\nSetting the Random State seed to %d" % (cv_seed))
    #    np.random.seed(cv_seed)

    # first check to see if the algorithms have already been run
    # and if the results should be overwritten
    if kwargs['forcealg'] is True or len(goids) == 1:
        # runners_to_run is a list of runners for each repetition
        runners_to_run = {i: alg_runners for i in range(1, num_reps+1)}
    else:
        runners_to_run = {}
        # a different file is stored for each repetition, so check each one
        for rep in range(1, num_reps+1):
            curr_runners_to_run = []
            curr_seed = cv_seed
            if curr_seed is not None:
                # add the current repetition number to the seed
                curr_seed += rep-1
            for run_obj in alg_runners:
                out_file = "%s/cv-%dfolds-rep%d%s%s.txt" % (
                    run_obj.out_dir, folds, rep,
                    "-seed%s" % curr_seed if curr_seed is not None else "", run_obj.params_str)
                if os.path.isfile(out_file):
                    print("%s already exists. Use --forcealg to overwrite" % (out_file))
                else:
                    curr_runners_to_run.append(run_obj)
            runners_to_run[rep] = curr_runners_to_run

    # repeat the CV process the specified number of times
    for rep in range(1, num_reps+1):
        if len(runners_to_run[rep]) == 0:
            continue
        curr_seed = cv_seed
        if curr_seed is not None:
            # add the current repetition number to the seed
            curr_seed += rep-1
        # split the annotation matrix into training and testing matrices K times
        ann_matrix_folds = split_cv_all_goterms(ann_obj, folds=folds, seed=curr_seed, **kwargs)

        for run_obj in runners_to_run[rep]:
            # because each fold contains a different set of positives, and combined they contain all positives,
            # store all of the prediction scores from each fold in a matrix
            combined_fold_scores = sparse.lil_matrix(ann_matrix.shape, dtype=float)
            for curr_fold, (train_ann_mat, test_ann_mat) in enumerate(ann_matrix_folds):
                print("* "*20)
                print("Fold %d" % (curr_fold+1))

                # change the annotation matrix to the current fold
                curr_ann_obj = setup.Sparse_Annotations(train_ann_mat, goids, prots)
                # replace the ann_obj in the runner with the current fold's annotations
                run_obj.ann_obj = curr_ann_obj
                run_obj.train_mat = train_ann_mat
                run_obj.test_mat = test_ann_mat
                #alg_runners = run_eval_algs.setup_runners([alg], alg_settings, net_obj, curr_ann_obj, **kwargs)
                # now setup the inputs for the runners
                run_obj.setupInputs()
                # run the alg
                run_obj.run()
                # parse the outputs. Only needed for the algs that write output files
                run_obj.setupOutputs()

                # store only the scores of the test (left-out) positives and negatives
                for i in range(len(goids)):
                    test_pos, test_neg = alg_utils.get_goid_pos_neg(test_ann_mat, i)
                    curr_goid_scores = run_obj.goid_scores[i].toarray().flatten()
                    curr_comb_scores = combined_fold_scores[i].toarray().flatten()
                    curr_comb_scores[test_pos] = curr_goid_scores[test_pos]
                    curr_comb_scores[test_neg] = curr_goid_scores[test_neg]
                    combined_fold_scores[i] = curr_comb_scores

            # replace the goid_scores in the runner with combined_fold_scores to evaluate
            run_obj.goid_scores = combined_fold_scores

            #curr_goids = dag_goids if alg == 'birgrank' else goids
            # now evaluate the results and write to a file
            out_file = "%s/cv-%dfolds-rep%d%s%s.txt" % (
                run_obj.out_dir, folds, rep,
                "-seed%s" % curr_seed if curr_seed is not None else "", run_obj.params_str)
            utils.checkDir(os.path.dirname(out_file))
            eval_utils.evaluate_ground_truth(
                run_obj, ann_obj, out_file,
                #non_pos_as_neg_eval=opts.non_pos_as_neg_eval,
                alg=run_obj.name, append=False, **kwargs)

    print("Finished running cross-validation")
    return
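# Tiny self-contained sketch (made-up data) of the fold-merging idea used above: each fold
# contributes scores only for its own held-out positives and negatives, and together the
# folds fill in one combined score matrix that is then evaluated.
def _example_merge_fold_scores():
    import numpy as np
    from scipy import sparse as _sparse
    combined = _sparse.lil_matrix((1, 6), dtype=float)      # 1 term, 6 proteins
    fold_scores = np.array([.9, .1, .8, .2, .7, .3])        # scores from one fold
    test_pos, test_neg = np.array([0, 2]), np.array([1])    # indices held out in this fold
    row = combined[0].toarray().flatten()
    row[test_pos] = fold_scores[test_pos]
    row[test_neg] = fold_scores[test_neg]
    combined[0] = row
    return combined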
def weight_SWSN(ann_matrix, sparse_nets=None, normalized_nets=None, net_names=None,
                out_file=None, nodes=None, verbose=False):
    """
    *normalized_nets*: list of networks stored as scipy sparse matrices. Should already be normalized
    """
    # UPDATED: normalize the networks
    if sparse_nets is not None:
        print("Normalizing the networks")
        normalized_nets = []
        for net in sparse_nets:
            normalized_nets.append(_net_normalize(net))
    elif normalized_nets is None:
        print("No networks given. Nothing to do")
        return None, 0, None
    if len(normalized_nets) == 1:
        print("Only one network given to weight_SWSN. Nothing to do.")
        total_time = 0
        # fall back to the normalized network if the un-normalized one wasn't passed in;
        # a single network gets a weight of 1 so the return matches the 3-tuple below
        single_net = sparse_nets[0] if sparse_nets is not None else normalized_nets[0]
        return single_net, total_time, [1]
    if verbose:
        print("Removing rows with 0 annotations/positives")
        utils.print_memory_usage()

    # remove rows with 0 annotations/positives
    empty_rows = []
    for i in range(ann_matrix.shape[0]):
        pos, neg = alg_utils.get_goid_pos_neg(ann_matrix, i)
        # the combineWeightsSWSN method doesn't seem to
        # work if there's only 1 positive
        if len(pos) <= 1 or len(neg) <= 1:
            empty_rows.append(i)
    # don't modify the original annotation matrix to keep the rows matching the GO ids
    curr_ann_mat = delete_rows_csr(ann_matrix.tocsr(), empty_rows)

    if verbose:
        utils.print_memory_usage()
    print("Weighting networks for %d different GO terms" % (curr_ann_mat.shape[0]))
    print("Running simultaneous weights with specific negatives")
    start_time = time.process_time()
    alpha, indices = combineNetworksSWSN(curr_ann_mat, normalized_nets, verbose=verbose)
    # print out the computed weights for each network
    if net_names is not None:
        print("network weights:")
        #print("\tnetworks chosen: %s" % (', '.join([net_names[i] for i in indices])))
        weights = defaultdict(int)
        for i in range(len(alpha)):
            weights[net_names[indices[i]]] = alpha[i]
        weights_table = ["%0.3e" % weights[net] for net in net_names]
        print('\t'.join(net_names))
        print('\t'.join(weights_table))

    # now add the networks together with the alpha weight applied
    weights_list = [0] * len(normalized_nets)
    weights_list[indices[0]] = alpha[0]
    combined_network = alpha[0] * normalized_nets[indices[0]]
    for i in range(1, len(alpha)):
        combined_network += alpha[i] * normalized_nets[indices[i]]
        weights_list[indices[i]] = alpha[i]
    total_time = time.process_time() - start_time

    if out_file is not None:
        # replace the .txt if present
        out_file = out_file.replace('.txt', '.npz')
        utils.checkDir(os.path.dirname(out_file))
        print("\twriting combined network to %s" % (out_file))
        sp.save_npz(out_file, combined_network)
        # also write the node ids so it's easier to access
        # TODO figure out a better way to store this
        node2idx_file = out_file + "-node-ids.txt"
        print("\twriting node ids to %s" % (node2idx_file))
        with open(node2idx_file, 'w') as out:
            out.write(''.join("%s\t%s\n" % (n, i) for i, n in enumerate(nodes)))
        # write the alpha/weight of the networks as well
        net_weight_file = out_file + "-net-weights.txt"
        print("\twriting network weights to %s" % (net_weight_file))
        with open(net_weight_file, 'w') as out:
            out.write(''.join("%s\t%s\n" % (net_names[idx], str(alpha[i]))
                              for i, idx in enumerate(indices)))

    return combined_network, total_time, weights_list
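# Small self-contained sketch (made-up numbers) of the weighted combination computed above:
# the combined network is simply the alpha-weighted sum of the individual normalized networks.
def _example_combine_networks():
    import numpy as np
    from scipy import sparse as _sparse
    net1 = _sparse.csr_matrix(np.array([[0, 1.0], [1.0, 0]]))
    net2 = _sparse.csr_matrix(np.array([[0, 0.5], [0.5, 0]]))
    alpha = [0.8, 0.2]    # per-network weights, as combineNetworksSWSN would return them
    combined = alpha[0] * net1 + alpha[1] * net2
    return combined       # the single edge weight here is 0.8*1 + 0.2*0.5 = 0.9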
def run_algs(alg_runners, **kwargs):
    """
    Runs all of the specified algorithms with the given network and annotations.
    Each runner should return the GO term prediction scores for each node in a sparse matrix.
    """
    # first check to see if the algorithms have already been run
    # and if the results should be overwritten
    for run_obj in alg_runners:
        out_file = "%s/pred-scores%s.txt" % (run_obj.out_dir, run_obj.params_str)
        run_obj.out_file = out_file
        run_obj.out_pref = out_file.replace(".txt", "")
    if kwargs['forcealg'] is True or kwargs['num_pred_to_write'] == 0:
        runners_to_run = alg_runners
    else:
        runners_to_run = []
        for run_obj in alg_runners:
            if os.path.isfile(run_obj.out_file):
                print("%s already exists. Use --forcealg to overwrite" % (run_obj.out_file))
            else:
                runners_to_run.append(run_obj)

    params_results = {}

    print("Generating inputs")
    # now setup the inputs for the runners
    for run_obj in runners_to_run:
        run_obj.setupInputs()

    print("Running the algorithms")
    # run the algs
    # TODO storing all of the runners' scores simultaneously could be costly (too much RAM).
    for run_obj in runners_to_run:
        run_obj.run()
        print(run_obj.params_results)
        params_results.update(run_obj.params_results)

    # parse the outputs. Only needed for the algs that write output files
    for run_obj in runners_to_run:
        run_obj.setupOutputs()

        # write to file if specified
        num_pred_to_write = kwargs['num_pred_to_write']
        if kwargs.get('factor_pred_to_write') is not None:
            # make a dictionary with the # ann*factor for each term
            num_pred_to_write = {}
            for i in range(run_obj.ann_matrix.shape[0]):
                y = run_obj.ann_matrix[i, :]
                positives = (y > 0).nonzero()[1]
                num_pred_to_write[run_obj.goids[i]] = len(positives) * kwargs['factor_pred_to_write']
        if num_pred_to_write != 0:
            # TODO generate the output file paths in the runner object
            #out_file = run_obj.out_file
            utils.checkDir(os.path.dirname(run_obj.out_file))
            alg_utils.write_output(run_obj.goid_scores, run_obj.ann_obj.goids,
                                   run_obj.ann_obj.prots, run_obj.out_file,
                                   num_pred_to_write=num_pred_to_write)

    eval_loso.write_stats_file(runners_to_run, params_results)
    print(params_results)
    print("Finished")
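# Brief illustration (toy annotation matrix, not real pipeline data) of how the
# factor_pred_to_write option above becomes a per-term number of predictions to write:
# the number of positives for the term multiplied by the factor.
def _example_factor_pred_to_write(factor=2):
    import numpy as np
    from scipy import sparse as _sparse
    ann_matrix = _sparse.csr_matrix(np.array([[1, 1, 0, -1],    # term A: 2 positives
                                              [1, 0, 0,  0]]))  # term B: 1 positive
    goids = ["GO:A", "GO:B"]
    num_pred_to_write = {}
    for i in range(ann_matrix.shape[0]):
        y = ann_matrix[i, :]
        positives = (y > 0).nonzero()[1]
        num_pred_to_write[goids[i]] = len(positives) * factor
    return num_pred_to_write    # {'GO:A': 4, 'GO:B': 2}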
def setup_net(input_dir, dataset, **kwargs):
    # load the network matrix and protein IDs
    net_files = None
    if 'net_files' in dataset:
        net_files = ["%s/%s/%s" % (input_dir, dataset['net_version'], net_file)
                     for net_file in dataset['net_files']]
    unweighted = dataset['net_settings'].get('unweighted', False) \
        if 'net_settings' in dataset else False
    # if multiple networks are passed in, then set multi_net to True automatically
    if (net_files is not None and len(net_files) > 1) or 'string_net_files' in dataset:
        if dataset.get('multi_net') is False:
            print("WARNING: multiple networks were passed in. Setting 'multi_net' to True")
        dataset['multi_net'] = True

    # parse and store the networks
    if dataset.get('multi_net') is True:
        # if multiple file names are passed in, then map each one of them
        if net_files is not None or 'string_net_files' in dataset:
            string_net_files = ["%s/%s/%s" % (input_dir, dataset['net_version'], string_net_file)
                                for string_net_file in dataset['string_net_files']]
            string_nets = None
            if 'string_nets' in dataset['net_settings']:
                string_nets = string_utils.convert_string_naming_scheme(
                    dataset['net_settings']['string_nets'])
            # they all need to have the same rows and columns, which is handled by this function
            # this function also creates the multi net file if it doesn't exist
            string_cutoff = dataset['net_settings'].get('string_cutoff', 150)
            out_pref = "%s/sparse-nets/c%d-" % (
                os.path.dirname(string_net_files[0]), string_cutoff)
            utils.checkDir(os.path.dirname(out_pref))
            sparse_nets, net_names, prots = setup.create_sparse_net_file(
                out_pref, net_files=net_files, string_net_files=string_net_files,
                string_nets=string_nets, string_cutoff=string_cutoff,
                forcenet=kwargs.get('forcenet', False))
        else:
            # if a .mat file with multiple sparse matrix networks inside of it is passed in, read that here
            net_names_file = "%s/%s/%s" % (
                input_dir, dataset['net_version'], dataset['net_settings']['net_names_file'])
            node_ids_file = "%s/%s/%s" % (
                input_dir, dataset['net_version'], dataset['net_settings']['node_ids_file'])
            # the .mat file itself is assumed here to be the (single) entry of 'net_files'
            sparse_nets, net_names, prots = alg_utils.read_multi_net_file(
                net_files[0], net_names_file, node_ids_file)
        weight_method = dataset['net_settings']['weight_method'].lower()
        net_obj = setup.Sparse_Networks(
            sparse_nets, prots, net_names=net_names, weight_method=weight_method,
            unweighted=unweighted, verbose=kwargs.get('verbose', False))
    else:
        if net_files is None:
            print("ERROR: no net files specified in the config file. "
                  "Must provide either 'net_files' or 'string_net_files'")
            sys.exit()
        W, prots = alg_utils.setup_sparse_network(
            net_files[0], forced=kwargs.get('forcenet', False))
        net_obj = setup.Sparse_Networks(
            W, prots, unweighted=unweighted, verbose=kwargs.get('verbose', False))
    return net_obj
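# Hypothetical (illustration only) dataset entry from the YAML config that would drive the
# STRING multi-network branch of setup_net above. Only the key names mirror the lookups in
# the function; the file names, channel list, cutoff, and weight method values are assumptions.
_example_dataset = {
    'net_version': '2020-networks',
    'exp_name': 'expA',
    'multi_net': True,
    'string_net_files': ['string-links.txt.gz'],
    'net_settings': {
        'string_nets': ['experiments', 'coexpression'],  # passed through convert_string_naming_scheme
        'string_cutoff': 400,       # minimum STRING score to keep an edge
        'weight_method': 'swsn',    # how Sparse_Networks combines the individual networks
    },
}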
def run(config_map, **kwargs):
    input_settings = config_map['input_settings']
    input_dir = input_settings['input_dir']
    alg_settings = config_map['algs']
    output_settings = config_map['output_settings']
    postfix = kwargs.get("postfix")
    # combine the evaluation settings in the config file and the kwargs
    kwargs.update(config_map['eval_settings'])
    # if specified, use this postfix, meaning overwrite the postfix from the yaml file
    if postfix is not None:
        kwargs['postfix'] = postfix
    # otherwise use the default empty string
    elif kwargs.get('postfix') is None:
        kwargs['postfix'] = ""

    for dataset in input_settings['datasets']:
        # add options specified for this dataset to kwargs
        # youngs_neg: for a term t, a gene g cannot be a negative for t
        #   if g shares an annotation with any gene annotated to t
        kwargs['youngs_neg'] = dataset.get('youngs_neg')
        # leaf_terms_only: limit the terms to only those that are the most specific,
        #   meaning remove the ancestors of all terms
        kwargs['leaf_terms_only'] = dataset.get('leaf_terms_only')
        # sp_leaf_terms_only: like leaf_terms_only, but applied per species (sp)
        kwargs['sp_leaf_terms_only'] = dataset.get('sp_leaf_terms_only')

        net_obj, ann_obj, eval_ann_obj = setup_dataset(dataset, input_dir, alg_settings, **kwargs)
        # if there are no annotations, then skip this dataset
        if len(ann_obj.goids) == 0:
            print("No terms found. Skipping this dataset")
            continue
        # the outputs will follow this structure:
        #     outputs/<net_version>/<exp_name>/<alg_name>/output_files
        out_dir = "%s/%s/%s/" % (
            output_settings['output_dir'], dataset['net_version'], dataset['exp_name'])
        alg_runners = setup_runners(alg_settings, net_obj, ann_obj, out_dir, **kwargs)

        # first run prediction mode since it is the fastest
        if kwargs['only_eval'] is False:
            # run algorithms in "prediction" mode
            run_algs(alg_runners, **kwargs)
            # if specified, write the SWSN combined network to a file
            save_net = dataset['net_settings'].get('save_net', None) \
                if 'net_settings' in dataset else None
            if net_obj.weight_swsn is True and save_net is not None:
                out_file = "%s/%s/%s" % (input_dir, dataset['net_version'], save_net)
                # the SWSN network is part of the runner object. Need to organize that better
                net_obj.save_net(out_file)

            # if a pos_neg_file_eval was passed in (e.g., for temporal holdout validation),
            # use it to evaluate the predictions
            if eval_ann_obj is not None:
                exp_type = "eval"
                # For LOSO, 'all-sp-loso' was used in the past
                #if kwargs.get('keep_ann') is not None:
                #    exp_type = "all-sp-loso"
                for run_obj in alg_runners:
                    out_file = "%s/%s%s%s.txt" % (
                        run_obj.out_dir, exp_type, run_obj.params_str, kwargs.get("postfix", ""))
                    utils.checkDir(os.path.dirname(out_file))
                    eval_utils.evaluate_ground_truth(run_obj, eval_ann_obj, out_file, **kwargs)

        if kwargs['cross_validation_folds'] is not None:
            # run cross validation
            cross_validation.run_cv_all_goterms(
                alg_runners, ann_obj, folds=kwargs['cross_validation_folds'], **kwargs)

        if kwargs['loso'] is True:
            # add the taxon file paths for this dataset to kwargs
            for arg in ['taxon_file', 'only_taxon_file']:
                kwargs[arg] = "%s/%s" % (input_dir, dataset[arg])
            # now run the leave-one-species-out eval
            eval_loso.eval_loso(alg_runners, ann_obj, eval_ann_obj=eval_ann_obj, **kwargs)
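# Minimal sketch (hypothetical file name, and only a subset of the expected kwargs) of
# driving run() above from a YAML config file, which is how the config_map dict is
# typically produced; the remaining options usually come from the config's eval_settings.
def _example_run_from_config(config_file="config-files/expA.yaml"):
    import yaml
    with open(config_file) as f:
        config_map = yaml.safe_load(f)
    run(config_map, only_eval=False, cross_validation_folds=5, loso=False)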
def run_and_eval_algs(run_obj, ann_obj, train_ann_mat, test_ann_mat, taxon=None, **kwargs):
    goids, prots = ann_obj.goids, ann_obj.prots
    dag_matrix = ann_obj.dag_matrix
    params_results = defaultdict(int)

    if kwargs.get('keep_ann', False) is True:
        print("Keeping all annotations when making predictions")
    elif kwargs.get('non_pos_as_neg_eval', False) is True:
        print("Evaluating using all non-ground-truth positives for the taxon as false positives")
    else:
        print("Evaluating using only the ground-truth negatives predicted as positives as false positives")

    # change the annotation matrix to the current training positive examples
    curr_ann_obj = setup.Sparse_Annotations(dag_matrix, train_ann_mat, goids, prots)
    # make an ann obj with the test ann mat
    test_ann_obj = setup.Sparse_Annotations(dag_matrix, test_ann_mat, goids, prots)

    # if this is a gene-based method, then run it on only the nodes which have a pos/neg annotation
    # unless specified otherwise by the "run_all_nodes" flag
    if run_obj.get_alg_type() == 'gene-based' and not run_obj.kwargs.get("run_all_nodes"):
        # sum the boolean of the columns, then use nonzero to get the columns with a nonzero value
        run_obj.kwargs['nodes_to_run'] = (test_ann_mat != 0).sum(axis=0).nonzero()[1]
        print("\trunning %s using only the %d pos/neg nodes" % (
            run_obj.name, len(run_obj.kwargs['nodes_to_run'])))

    # setup the output file. Could be used by the runners to write temp files or other output files
    exp_type = "loso"
    postfix = kwargs.get("postfix", "")
    if kwargs['keep_ann']:
        exp_type = "eval-per-taxon"
    out_file = "%s/%s%s%s.txt" % (run_obj.out_dir, exp_type, run_obj.params_str, postfix)
    run_obj.out_pref = out_file.replace('.txt', '')
    utils.checkDir(os.path.dirname(out_file))

    # for sinksource_bounds, keep track of which nodes are either a left-out pos or left-out neg
    if run_obj.name in ['sinksource_bounds', 'sinksourceplus_bounds']:
        run_obj.params['rank_pos_neg'] = test_ann_mat

    # if predictions were already generated, and taxon is set to 'all', then use those.
    # otherwise, generate the prediction scores
    if kwargs['keep_ann'] and run_obj.goid_scores.getnnz() != 0:
        print("Using already computed scores")
    else:
        # replace the ann_obj in the runner with the current training annotations
        run_obj.ann_obj = curr_ann_obj
        #alg_runners = run_eval_algs.setup_runners([alg], alg_settings, curr_ann_obj, **kwargs)
        if kwargs.get('verbose'):
            utils.print_memory_usage()
        run_obj.setupInputs()
        if kwargs.get('verbose'):
            utils.print_memory_usage()
        run_obj.run()
        if kwargs.get('verbose'):
            utils.print_memory_usage()
        run_obj.setupOutputs(taxon=taxon)

    # now evaluate
    # this will write a file containing the fmax and other measures for each goterm
    # with the taxon name in the name of the file
    eval_utils.evaluate_ground_truth(
        run_obj, test_ann_obj, out_file,
        #non_pos_as_neg_eval=opts.non_pos_as_neg_eval,
        taxon=taxon, append=True, **kwargs)

    for key in run_obj.params_results:
        params_results[key] += run_obj.params_results[key]
    return params_results
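# Toy, self-contained illustration of the nodes_to_run selection used above for gene-based
# methods: a node (column) is kept if it has a positive or negative annotation for any term.
def _example_nodes_to_run():
    import numpy as np
    from scipy import sparse as _sparse
    # rows = terms, columns = nodes; 1 = positive, -1 = negative, 0 = unknown
    test_ann_mat = _sparse.csr_matrix(np.array([[1, 0, -1, 0],
                                                [0, 0,  1, 0]]))
    nodes_to_run = (test_ann_mat != 0).sum(axis=0).nonzero()[1]
    return nodes_to_run    # array([0, 2])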
def weight_SWSN(ann_matrix, sparse_nets, net_names=None, out_file=None, nodes=None):
    """ Weight the given networks using Simultaneous Weighting with Specific Negatives (SWSN)
    and return the weighted combination of the networks along with the time taken.
    """
    if len(sparse_nets) == 1:
        print("Only one network given to weight_SWSN. Nothing to do.")
        total_time = 0
        return sparse_nets[0], total_time
    # remove rows with 0 annotations/positives
    empty_rows = []
    for i in range(ann_matrix.shape[0]):
        pos, neg = alg_utils.get_goid_pos_neg(ann_matrix, i)
        # the combineWeightsSWSN method doesn't seem to
        # work if there's only 1 positive
        if len(pos) <= 1 or len(neg) <= 1:
            empty_rows.append(i)
    # don't modify the original annotation matrix to keep the rows matching the GO ids
    curr_ann_mat = delete_rows_csr(ann_matrix.tocsr(), empty_rows)

    # normalize the networks
    print("Normalizing the networks")
    normalized_nets = []
    for net in sparse_nets:
        normalized_nets.append(_net_normalize(net))

    print("Weighting networks for %d different GO terms" % (curr_ann_mat.shape[0]))
    print("Running simultaneous weights with specific negatives")
    start_time = time.process_time()
    alpha, indices = combineNetworksSWSN(curr_ann_mat, normalized_nets)
    if net_names is not None:
        print("\tnetworks chosen: %s" % (', '.join([net_names[i] for i in indices])))

    # now add the networks together with the alpha weight applied
    combined_network = alpha[0] * sparse_nets[indices[0]]
    for i in range(1, len(alpha)):
        combined_network += alpha[i] * sparse_nets[indices[i]]
    total_time = time.process_time() - start_time

    if out_file is not None:
        # replace the .txt if present
        out_file = out_file.replace('.txt', '.npz')
        utils.checkDir(os.path.dirname(out_file))
        print("\twriting combined network to %s" % (out_file))
        sparse.save_npz(out_file, combined_network)
        # also write the node ids so it's easier to access
        # TODO figure out a better way to store this
        node2idx_file = out_file + "-node-ids.txt"
        print("\twriting node ids to %s" % (node2idx_file))
        with open(node2idx_file, 'w') as out:
            out.write(''.join("%s\t%s\n" % (n, i) for i, n in enumerate(nodes)))
        # write the alpha/weight of the networks as well
        net_weight_file = out_file + "-net-weights.txt"
        print("\twriting network weights to %s" % (net_weight_file))
        with open(net_weight_file, 'w') as out:
            out.write(''.join("%s\t%s\n" % (net_names[idx], str(alpha[i]))
                              for i, idx in enumerate(indices)))
    return combined_network, total_time