Example no. 1
def setup_variables(config_map, out_pref='', **kwargs):
    """
    Function to setup the various args specified in kwargs
    """
    input_settings = config_map['input_settings']
    #input_dir = input_settings['input_dir']
    alg_settings = config_map['algs']
    output_settings = config_map['output_settings']
    # update the settings specified in this script with those set in the yaml file
    if config_map.get('eval_settings'):
        kwargs.update(config_map['eval_settings'])
    if config_map.get('plot_settings'):
        #config_map['plot_settings'].update(kwargs)
        kwargs.update(config_map['plot_settings'])
        # overwrite whatever is in the plot settings with the specified args
        if kwargs.get('out_pref') and out_pref != '':
            del kwargs['out_pref']
            #kwargs['out_pref'] = out_pref
        elif kwargs.get('out_pref'):
            out_pref = kwargs['out_pref']
    if kwargs.get('term_stats') is not None:
        df_stats_all = pd.DataFrame()
        for f in kwargs['term_stats']:
            df_stats = pd.read_csv(f, sep='\t')
            df_stats_all = pd.concat([df_stats_all, df_stats])
        kwargs['term_stats'] = df_stats_all

    # if no postfix was set in the yaml file or in this script, then set it to empty
    if kwargs.get('postfix') is None:
        kwargs['postfix'] = ''

    if out_pref == "":
        out_pref = "%s/viz/%s/%s/" % (
            output_settings['output_dir'],
            input_settings['datasets'][0]['net_version'],
            input_settings['datasets'][0]['exp_name'])
    if kwargs.get('only_terms_file') is not None:
        only_terms = pd.read_csv(kwargs['only_terms_file'],
                                 sep='\t',
                                 index_col=None)
        only_terms = only_terms.iloc[:, 0].values
        print("limitting to %d terms from %s" %
              (len(only_terms), kwargs['only_terms_file']))
        kwargs['only_terms'] = only_terms
        # setup the name to add to the output file
        only_terms_postfix = kwargs['only_terms_name'].lower() + str(
            len(kwargs['only_terms'])) + '-'
        out_pref += only_terms_postfix

    # TODO only create the output dir if plots will be created
    if out_pref is not None:
        out_pref += kwargs.get('postfix', '')
        utils.checkDir(os.path.dirname(out_pref))

    return input_settings, alg_settings, output_settings, out_pref, kwargs
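# A minimal, hypothetical sketch of the config_map structure that setup_variables()
# expects: the top-level keys and the eval/plot settings that get merged into kwargs.
# All directory names and values below are made up for illustration.
config_map = {
    'input_settings': {
        'datasets': [{'net_version': 'net-v1', 'exp_name': 'expc-bp'}],
    },
    'algs': {},  # per-algorithm settings would go here
    'output_settings': {'output_dir': 'outputs'},
    'eval_settings': {'cross_validation_folds': 5},
    'plot_settings': {'measures': ['fmax'], 'boxplot': True},
}
# input_settings, alg_settings, output_settings, out_pref, kwargs = \
#     setup_variables(config_map, out_pref='outputs/viz/test-')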
Example no. 2
def main(config_map, **kwargs):
    input_settings = config_map['input_settings']
    #input_dir = input_settings['input_dir']
    alg_settings = config_map['algs']
    output_settings = config_map['output_settings']
    if kwargs.get('term_stats') is not None:
        df_stats_all = pd.DataFrame()
        for f in kwargs['term_stats']:
            df_stats = pd.read_csv(f, sep='\t')
            df_stats_all = pd.concat([df_stats_all, df_stats])
        kwargs['term_stats'] = df_stats_all

    utils.checkDir(os.path.dirname(kwargs['out_pref']))
    # plot prec-rec separately from everything else
    if kwargs['prec_rec']:
        # loop through all specified terms, or use an empty string if no terms were specified
        terms = kwargs['goterm'] if kwargs['goterm'] is not None else ['']
        for term in terms:
            term = '-' + term if term != '' else ''
            prec_rec = 'prec-rec' + term
            #kwargs['prec_rec'] = prec_rec
            df_all = load_all_results(input_settings,
                                      alg_settings,
                                      output_settings,
                                      prec_rec_str=prec_rec,
                                      **kwargs)
            if len(df_all) == 0:
                print("no terms found. Quitting")
                sys.exit()

            title = '-'.join(df_all['plot_exp_name'].unique())
            plot_curves(df_all, title=title, **kwargs)
    else:
        # get the path to the specified files for each alg
        df_all = load_all_results(input_settings, alg_settings,
                                  output_settings, **kwargs)
        if len(df_all) == 0:
            print("no terms found. Quitting")
            sys.exit()
        algs = df_all['Algorithm'].unique()

        print("\t%d algorithms, %d plot_exp_name values\n" %
              (len(algs), len(df_all['plot_exp_name'].unique())))
        #print(df_all.head())
        results_overview(df_all, measures=kwargs['measures'])

        # TODO currently only handles one dataset
        title = '-'.join(df_all['plot_exp_name'].unique())

        # now attempt to figure out what labels/titles to put in the plot based on the net version, exp_name, and plot_exp_name
        for measure in kwargs['measures']:
            if kwargs['boxplot']:
                plot_boxplot(df_all, measure=measure, title=title, **kwargs)
            if kwargs['scatter']:
                plot_scatter(df_all, measure=measure, title=title, **kwargs)
Example no. 3
def save_net(self, out_file):
    print("Writing %s" % (out_file))
    utils.checkDir(os.path.dirname(out_file))
    if out_file.endswith('.npz'):
        # when the net was loaded, the idx file was already written
        # so no need to write it again
        sp.save_npz(out_file, self.W_SWSN)
    else:
        # convert the adjacency matrix to an edgelist
        G = nx.from_scipy_sparse_matrix(self.W_SWSN)
        idx2node = {i: n for i, n in enumerate(self.nodes)}
        # see also convert_node_labels_to_integers
        G = nx.relabel_nodes(G, idx2node, copy=False)
        delimiter = '\t'
        if out_file.endswith('.csv'):
            delimiter = ','
        nx.write_weighted_edgelist(G, out_file, delimiter=delimiter)
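# A small self-contained sketch of the sparse-adjacency-to-edgelist conversion used in
# save_net() above, on a toy 3-node network. Note that networkx >= 3.0 renamed
# from_scipy_sparse_matrix() (used above) to from_scipy_sparse_array().
import networkx as nx
import numpy as np
from scipy import sparse as sp

W = sp.csr_matrix(np.array([[0, 0.5, 0],
                            [0.5, 0, 2.0],
                            [0, 2.0, 0]]))
nodes = ['P1', 'P2', 'P3']
try:
    G = nx.from_scipy_sparse_array(W)      # networkx >= 3.0
except AttributeError:
    G = nx.from_scipy_sparse_matrix(W)     # older networkx, as in save_net() above
G = nx.relabel_nodes(G, {i: n for i, n in enumerate(nodes)}, copy=False)
nx.write_weighted_edgelist(G, 'toy-net.txt', delimiter='\t')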
def main(sparse_net_file, obo_file, pos_neg_file=None, gaf_file=None, ignore_ec=["IEA"],
         alpha=.5, theta=.5, mu=.5, h="bp", out_pref=None):

    W, prots = alg_utils.setup_sparse_network(sparse_net_file)
    # parse the go_dags first as it also sets up the goid_to_category dictionary
    go_dags = go_examples.parse_obo_file_and_build_dags(obo_file)

    dag_matrix, ann_matrix, goids = build_h_ann_matrices(prots, go_dags, pos_neg_file=pos_neg_file, gaf_file=gaf_file, h=h)
    # make sure they're type float so matlab will parse them correctly
    sparse_net = W.astype('float') 
    ann_matrix = ann_matrix.astype('float') 
    dag_matrix = dag_matrix.astype('float')

    if out_pref is not None:
        out_file = "%s%s-annotations-and-go-dag.mat" % (out_pref, h)
        utils.checkDir(os.path.dirname(out_file))

        print("\twriting graph, annotation, and hierarchy matrices to %s" % (out_file))
        # write these to a file to run the matlab BirgRank 
        savemat(out_file, {"G": sparse_net, "R": ann_matrix, "H": dag_matrix}, do_compression=True)

        goids_file = "%s%s-goids.txt" % (out_pref, h)
        print("\twriting goids to %s" % (goids_file))
        with open(goids_file, 'w') as out:
            out.write(''.join("%s\n" % (goid) for goid in goids))

    run_birgrank = True 
    if run_birgrank is True:
        Xh = birgRank(sparse_net, ann_matrix.transpose(), dag_matrix,
                      alpha=alpha, theta=theta, mu=mu,
                      eps=0.0001, max_iters=1000, verbose=True)
        Xh = Xh.T
        print(Xh.shape)

        if out_pref is not None:
            out_file = "%s%s-pred-scores.txt" % (out_pref, h)
            print("\twriting scores to %s" % (out_file))
            # write the results for a single GO term
            with open(out_file, 'w') as out:
                for i in range(Xh.shape[0]):
                    print("writing results for goterm %s" % (goids[i]))
                    out.write(''.join("%s\t%s\t%s\n" % (goids[i], prots[j], score)
                                      for j, score in enumerate(Xh[i].toarray().flatten())))
                    break
    return
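# A minimal sketch of writing sparse graph (G), annotation (R), and GO hierarchy (H)
# matrices to a .mat file for the MATLAB BirgRank implementation, as done above.
# The tiny random matrices below are placeholders.
import numpy as np
from scipy import sparse
from scipy.io import savemat

G = sparse.random(5, 5, density=0.4, format='csr').astype(float)   # protein network
R = sparse.random(3, 5, density=0.5, format='csr').astype(float)   # term x protein annotations
H = sparse.eye(3, format='csr').astype(float)                      # term x term hierarchy
savemat('toy-annotations-and-go-dag.mat', {"G": G, "R": R, "H": H}, do_compression=True)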
def run_cv_all_goterms(
        alg_runners, ann_obj, folds=5, num_reps=1, 
        cv_seed=None, **kwargs):
    """
    Split the positives and negatives into folds across all GO terms
    and then run the algorithms on those folds.
    Algorithms are all run on the same split of data. 
    *num_reps*: Number of times to repeat cross-validation. 
    An output file will be written for each repeat
    *cv_seed*: Seed to use for the random number generator when splitting the annotations into folds
        If *num_reps* > 1, the seed will be incremented by 1 each time
    """
    ann_matrix = ann_obj.ann_matrix
    goids, prots = ann_obj.goids, ann_obj.prots

    # set the cv_seed if specified
    # 2019-06-26 BUG: If there are a different number of terms, or the order of the terms changed, then the results won't be the same
    #if cv_seed is not None:
    #    print("\nSetting the Random State seed to %d" % (cv_seed))
    #    np.random.seed(cv_seed)

    # first check to see if the algorithms have already been run
    # and if the results should be overwritten
    if kwargs['forcealg'] is True or len(goids) == 1:
        # runners_to_run is a list of runners for each repetition
        runners_to_run = {i: alg_runners for i in range(1,num_reps+1)}
    else:
        runners_to_run = {}
        # a different file is stored for each repetition, so check each one
        for rep in range(1,num_reps+1):
            curr_runners_to_run = [] 
            curr_seed = cv_seed
            if curr_seed is not None:
                # add the current repetition number to the seed
                curr_seed += rep-1
            for run_obj in alg_runners:
                out_file = "%s/cv-%dfolds-rep%d%s%s.txt" % (
                    run_obj.out_dir, folds, rep,
                    "-seed%s"%curr_seed if curr_seed is not None else "", run_obj.params_str)
                if os.path.isfile(out_file):
                    print("%s already exists. Use --forcealg to overwite" % (out_file))
                else:
                    curr_runners_to_run.append(run_obj)
            runners_to_run[rep] = curr_runners_to_run

    # repeat the CV process the specified number of times
    for rep in range(1,num_reps+1):
        if len(runners_to_run[rep]) == 0:
            continue
        curr_seed = cv_seed
        if curr_seed is not None:
            # add the current repetition number to the seed
            curr_seed += rep-1
        # split the annotation matrix into training and testing matrices K times
        ann_matrix_folds = split_cv_all_goterms(ann_obj, folds=folds, seed=curr_seed, **kwargs)

        for run_obj in runners_to_run[rep]:
            # because each fold contains a different set of positives, and combined they contain all positives,
            # store all of the prediction scores from each fold in a matrix
            combined_fold_scores = sparse.lil_matrix(ann_matrix.shape, dtype=float)
            for curr_fold, (train_ann_mat, test_ann_mat) in enumerate(ann_matrix_folds):
                print("*  "*20)
                print("Fold %d" % (curr_fold+1))

                # change the annotation matrix to the current fold
                curr_ann_obj = setup.Sparse_Annotations(train_ann_mat, goids, prots)
                # replace the ann_obj in the runner with the current fold's annotations  
                run_obj.ann_obj = curr_ann_obj
                run_obj.train_mat = train_ann_mat
                run_obj.test_mat = test_ann_mat
                #alg_runners = run_eval_algs.setup_runners([alg], alg_settings, net_obj, curr_ann_obj, **kwargs)
                # now setup the inputs for the runners
                run_obj.setupInputs()
                # run the alg
                run_obj.run()
                # parse the outputs. Only needed for the algs that write output files
                run_obj.setupOutputs()

                # store only the scores of the test (left out) positives and negatives
                for i in range(len(goids)):
                    test_pos, test_neg = alg_utils.get_goid_pos_neg(test_ann_mat, i)
                    curr_goid_scores = run_obj.goid_scores[i].toarray().flatten()
                    curr_comb_scores = combined_fold_scores[i].toarray().flatten()
                    curr_comb_scores[test_pos] = curr_goid_scores[test_pos]
                    curr_comb_scores[test_neg] = curr_goid_scores[test_neg]
                    combined_fold_scores[i] = curr_comb_scores 

            # replace the goid_scores in the runner to combined_fold_scores to evaluate
            run_obj.goid_scores = combined_fold_scores 

            #curr_goids = dag_goids if alg == 'birgrank' else goids
            # now evaluate the results and write to a file
            out_file = "%s/cv-%dfolds-rep%d%s%s.txt" % (
                run_obj.out_dir, folds, rep,
                "-seed%s"%curr_seed if curr_seed is not None else "", run_obj.params_str)
            utils.checkDir(os.path.dirname(out_file)) 
            eval_utils.evaluate_ground_truth(
                run_obj, ann_obj, out_file,
                #non_pos_as_neg_eval=opts.non_pos_as_neg_eval,
                alg=run_obj.name, append=False, **kwargs)

    print("Finished running cross-validation")
    return
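# split_cv_all_goterms() is not shown in these examples. Below is only a rough sketch,
# under the assumption that each term's positives (+1) and negatives (-1) are shuffled
# and dealt into k disjoint test matrices, with the training matrix being what is left.
import numpy as np
from scipy import sparse

def split_ann_matrix_sketch(ann_matrix, folds=5, seed=None):
    rng = np.random.default_rng(seed)
    ann_matrix = ann_matrix.tocsr()
    fold_masks = [sparse.lil_matrix(ann_matrix.shape) for _ in range(folds)]
    for i in range(ann_matrix.shape[0]):
        row = ann_matrix[i].toarray().flatten()
        for sign in (1, -1):
            cols = (row == sign).nonzero()[0]
            rng.shuffle(cols)
            for f in range(folds):
                fold_cols = cols[f::folds]
                if len(fold_cols) > 0:
                    fold_masks[f][i, fold_cols] = sign
    # each pair is (train_mat, test_mat); combined, the test matrices cover all entries
    return [(ann_matrix - mask.tocsr(), mask.tocsr()) for mask in fold_masks]

ann = sparse.csr_matrix(np.array([[1, 1, -1, -1, 1, -1],
                                  [1, -1, 1, -1, -1, 1]]))
folds = split_ann_matrix_sketch(ann, folds=2, seed=0)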
Example no. 6
def weight_SWSN(ann_matrix,
                sparse_nets=None,
                normalized_nets=None,
                net_names=None,
                out_file=None,
                nodes=None,
                verbose=False):
    """ 
    *normalized_nets*: list of networks stored as scipy sparse matrices. Should already be normalized
    """
    # UPDATED: normalize the networks
    if sparse_nets is not None:
        print("Normalizing the networks")
        normalized_nets = []
        for net in sparse_nets:
            normalized_nets.append(_net_normalize(net))
    elif normalized_nets is None:
        print("No networks given. Nothing to do")
        return None, 0, None
    if len(normalized_nets) == 1:
        print("Only one network given to weight_SWSN. Nothing to do.")
        total_time = 0
        # return the single network (with a weight of 1) to match the 3-tuple returned below
        return (sparse_nets[0] if sparse_nets is not None else normalized_nets[0]), total_time, [1]
    if verbose:
        print("Removing rows with 0 annotations/positives")
        utils.print_memory_usage()
    # remove rows with 0 annotations/positives
    empty_rows = []
    for i in range(ann_matrix.shape[0]):
        pos, neg = alg_utils.get_goid_pos_neg(ann_matrix, i)
        # the combineWeightsSWSN method doesn't seem to
        # work if there's only 1 positive
        if len(pos) <= 1 or len(neg) <= 1:
            empty_rows.append(i)
    # don't modify the original annotation matrix to keep the rows matching the GO ids
    curr_ann_mat = delete_rows_csr(ann_matrix.tocsr(), empty_rows)

    if verbose:
        utils.print_memory_usage()
    print("Weighting networks for %d different GO terms" %
          (curr_ann_mat.shape[0]))
    print("Running simultaneous weights with specific negatives")
    start_time = time.process_time()
    alpha, indices = combineNetworksSWSN(curr_ann_mat,
                                         normalized_nets,
                                         verbose=verbose)
    # print out the computed weights for each network
    if net_names is not None:
        print("network weights:")
        #print("\tnetworks chosen: %s" % (', '.join([net_names[i] for i in indices])))
        weights = defaultdict(int)
        for i in range(len(alpha)):
            weights[net_names[indices[i]]] = alpha[i]
        weights_table = ["%0.3e" % weights[net] for net in net_names]
        print('\t'.join(net_names))
        print('\t'.join(weights_table))

    # now add the networks together with the alpha weight applied
    weights_list = [0] * len(normalized_nets)
    weights_list[indices[0]] = alpha[0]
    combined_network = alpha[0] * normalized_nets[indices[0]]
    for i in range(1, len(alpha)):
        combined_network += alpha[i] * normalized_nets[indices[i]]
        weights_list[indices[i]] = alpha[i]
    total_time = time.process_time() - start_time

    if out_file is not None:
        # replace the .txt if present
        out_file = out_file.replace('.txt', '.npz')
        utils.checkDir(os.path.dirname(out_file))
        print("\twriting combined network to %s" % (out_file))
        sp.save_npz(out_file, combined_network)
        # also write the node ids so it's easier to access
        # TODO figure out a better way to store this
        node2idx_file = out_file + "-node-ids.txt"
        print("\twriting node ids to %s" % (node2idx_file))
        with open(node2idx_file, 'w') as out:
            out.write(''.join("%s\t%s\n" % (n, i)
                              for i, n in enumerate(nodes)))

        # write the alpha/weight of the networks as well
        net_weight_file = out_file + "-net-weights.txt"
        print("\twriting network weights to %s" % (net_weight_file))
        with open(net_weight_file, 'w') as out:
            out.write(''.join("%s\t%s\n" % (net_names[idx], str(alpha[i]))
                              for i, idx in enumerate(indices)))

    return combined_network, total_time, weights_list
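# A tiny sketch of the final combination step above: given the SWSN output (per-network
# weights alpha and the indices of the chosen networks; hard-coded stand-ins here),
# sum the weighted sparse networks into one combined network.
import numpy as np
from scipy import sparse as sp

nets = [sp.random(4, 4, density=0.5, format='csr') for _ in range(3)]
alpha, indices = np.array([0.7, 0.3]), [0, 2]   # e.g. network 1 received zero weight
combined_network = alpha[0] * nets[indices[0]]
for i in range(1, len(alpha)):
    combined_network += alpha[i] * nets[indices[i]]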
Example no. 7
def run_algs(alg_runners, **kwargs):
    """
    Runs all of the specified algorithms with the given network and annotations.
    Each runner should return the GO term prediction scores for each node in a sparse matrix.
    """
    # first check to see if the algorithms have already been run
    # and if the results should be overwritten
    for run_obj in alg_runners:
        out_file = "%s/pred-scores%s.txt" % (run_obj.out_dir,
                                             run_obj.params_str)
        run_obj.out_file = out_file
        run_obj.out_pref = out_file.replace(".txt", "")
    if kwargs['forcealg'] is True or kwargs['num_pred_to_write'] == 0:
        runners_to_run = alg_runners
    else:
        runners_to_run = []
        for run_obj in alg_runners:
            if os.path.isfile(run_obj.out_file):
                print("%s already exists. Use --forcealg to overwite" %
                      (run_obj.out_file))
            else:
                runners_to_run.append(run_obj)

    params_results = {}

    print("Generating inputs")
    # now setup the inputs for the runners
    for run_obj in runners_to_run:
        run_obj.setupInputs()

    print("Running the algorithms")
    # run the algs
    # TODO storing all of the runners scores simultaneously could be costly (too much RAM).
    for run_obj in runners_to_run:
        run_obj.run()
        print(run_obj.params_results)
        params_results.update(run_obj.params_results)

    # parse the outputs. Only needed for the algs that write output files
    for run_obj in runners_to_run:
        run_obj.setupOutputs()

        # write to file if specified
        num_pred_to_write = kwargs['num_pred_to_write']
        if kwargs.get('factor_pred_to_write') is not None:
            # make a dictionary with the # ann*factor for each term
            num_pred_to_write = {}
            for i in range(run_obj.ann_matrix.shape[0]):
                y = run_obj.ann_matrix[i, :]
                positives = (y > 0).nonzero()[1]
                num_pred_to_write[run_obj.goids[i]] = len(
                    positives) * kwargs['factor_pred_to_write']
        if num_pred_to_write != 0:
            # TODO generate the output file paths in the runner object
            #out_file = run_obj.out_file
            utils.checkDir(os.path.dirname(run_obj.out_file))
            alg_utils.write_output(run_obj.goid_scores,
                                   run_obj.ann_obj.goids,
                                   run_obj.ann_obj.prots,
                                   run_obj.out_file,
                                   num_pred_to_write=num_pred_to_write)

    eval_loso.write_stats_file(runners_to_run, params_results)
    print(params_results)
    print("Finished")
Example no. 8
def setup_net(input_dir, dataset, **kwargs):
    # load the network matrix and protein IDs
    net_files = None
    if 'net_files' in dataset:
        net_files = [
            "%s/%s/%s" % (input_dir, dataset['net_version'], net_file)
            for net_file in dataset['net_files']
        ]
    unweighted = dataset['net_settings'].get(
        'unweighted', False) if 'net_settings' in dataset else False
    # if multiple networks are passed in, then set multi_net to True automatically
    if (net_files is not None
            and len(net_files) > 1) or 'string_net_files' in dataset:
        if dataset.get('multi_net') is False:
            print(
                "WARNING: multiple networks were passed in. Setting 'multi_net' to True"
            )
        dataset['multi_net'] = True

    # parse and store the networks
    if dataset.get('multi_net') is True:
        # if multiple file names are passed in, then map each one of them
        if net_files is not None or 'string_net_files' in dataset:
            string_net_files = []
            if 'string_net_files' in dataset:
                string_net_files = [
                    "%s/%s/%s" %
                    (input_dir, dataset['net_version'], string_net_file)
                    for string_net_file in dataset['string_net_files']
                ]
            string_nets = None
            if 'string_nets' in dataset['net_settings']:
                string_nets = string_utils.convert_string_naming_scheme(
                    dataset['net_settings']['string_nets'])
            # they all need to have the same rows and columns, which is handled by this function
            # this function also creates the multi net file if it doesn't exist
            string_cutoff = dataset['net_settings'].get('string_cutoff', 150)
            # put the sparse-nets files next to the first network file given
            first_net_file = string_net_files[0] if string_net_files else net_files[0]
            out_pref = "%s/sparse-nets/c%d-" % (
                os.path.dirname(first_net_file), string_cutoff)
            utils.checkDir(os.path.dirname(out_pref))
            sparse_nets, net_names, prots = setup.create_sparse_net_file(
                out_pref,
                net_files=net_files,
                string_net_files=string_net_files,
                string_nets=string_nets,
                string_cutoff=string_cutoff,
                forcenet=kwargs.get('forcenet', False))
        else:
            # if a .mat file with multiple sparse matrix networks inside of it is passed in, read that here
            # NOTE: the original snippet referenced an undefined `net_file` variable here;
            # the path below assumes the .mat file is given under net_settings['net_file'] (key name assumed)
            net_file = "%s/%s/%s" % (
                input_dir, dataset['net_version'],
                dataset['net_settings']['net_file'])
            net_names_file = "%s/%s/%s" % (
                input_dir, dataset['net_version'],
                dataset['net_settings']['net_names_file'])
            node_ids_file = "%s/%s/%s" % (
                input_dir, dataset['net_version'],
                dataset['net_settings']['node_ids_file'])
            sparse_nets, net_names, prots = alg_utils.read_multi_net_file(
                net_file, net_names_file, node_ids_file)

        weight_method = dataset['net_settings']['weight_method'].lower()
        net_obj = setup.Sparse_Networks(sparse_nets,
                                        prots,
                                        net_names=net_names,
                                        weight_method=weight_method,
                                        unweighted=unweighted,
                                        verbose=kwargs.get('verbose', False))
    else:
        if net_files is None:
            print(
                "ERROR: no net files specified in the config file. Must provide either 'net_files', or 'string_net_files'"
            )
            sys.exit()
        W, prots = alg_utils.setup_sparse_network(net_files[0],
                                                  forced=kwargs.get(
                                                      'forcenet', False))
        net_obj = setup.Sparse_Networks(W,
                                        prots,
                                        unweighted=unweighted,
                                        verbose=kwargs.get('verbose', False))
    return net_obj
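# A hypothetical sketch of a dataset config entry showing the keys setup_net() reads.
# Only the key names come from the code above; every value and file name is made up.
dataset = {
    'net_version': 'net-v1',
    'net_files': ['net1.txt', 'net2.txt'],          # plain network files
    'string_net_files': ['string-net.txt'],         # and/or STRING network files
    'multi_net': True,
    'net_settings': {
        'unweighted': False,
        'string_nets': ['neighborhood', 'coexpression'],
        'string_cutoff': 400,
        'weight_method': 'swsn',
    },
}
# net_obj = setup_net('inputs', dataset, forcenet=False)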
Example no. 9
def run(config_map, **kwargs):
    input_settings = config_map['input_settings']
    input_dir = input_settings['input_dir']
    alg_settings = config_map['algs']
    output_settings = config_map['output_settings']
    postfix = kwargs.get("postfix")
    # combine the evaluation settings in the config file and the kwargs
    kwargs.update(config_map.get('eval_settings', {}))
    # if specified, use this postfix, meaning overwrite the postfix from the yaml file
    if postfix is not None:
        kwargs['postfix'] = postfix
    # otherwise use the default empty string
    elif kwargs.get('postfix') is None:
        kwargs['postfix'] = ""

    for dataset in input_settings['datasets']:
        # add options specified for this dataset to kwargs
        # youngs_neg: for a term t, a gene g cannot be a negative for t if g shares an annotation with any gene annotated to t
        kwargs['youngs_neg'] = dataset.get('youngs_neg')
        # leaf_terms_only: limit the terms to only those that are the most specific, meaning remove the ancestors of all terms
        kwargs['leaf_terms_only'] = dataset.get('leaf_terms_only')
        # sp_leaf_terms_only: limit the terms to only those that are the most specific, meaning remove the ancestors of all terms
        kwargs['sp_leaf_terms_only'] = dataset.get('sp_leaf_terms_only')

        net_obj, ann_obj, eval_ann_obj = setup_dataset(dataset, input_dir,
                                                       alg_settings, **kwargs)
        # if there are no annotations, then skip this dataset
        if len(ann_obj.goids) == 0:
            print("No terms found. Skipping this dataset")
            continue
        # the outputs will follow this structure:
        # outputs/<net_version>/<exp_name>/<alg_name>/output_files
        out_dir = "%s/%s/%s/" % (output_settings['output_dir'],
                                 dataset['net_version'], dataset['exp_name'])
        alg_runners = setup_runners(alg_settings, net_obj, ann_obj, out_dir,
                                    **kwargs)

        # first run prediction mode since it is the fastest
        if kwargs['only_eval'] is False:
            # run algorithms in "prediction" mode
            run_algs(alg_runners, **kwargs)
            # if specified, write the SWSN combined network to a file
            save_net = dataset['net_settings'].get(
                'save_net', None) if 'net_settings' in dataset else None
            if net_obj.weight_swsn is True and save_net is not None:
                out_file = "%s/%s/%s" % (input_dir, dataset['net_version'],
                                         save_net)
                # the SWSN network is part of the runner object. Need to organize that better
                net_obj.save_net(out_file)

            # if a pos_neg_file_eval was passed in (e.g., for temporal holdout validation),
            # use it to evaluate the predictions
            if eval_ann_obj is not None:
                exp_type = "eval"
                # For LOSO, 'all-sp-loso' was used in the past
                #if kwargs.get('keep_ann') is not None:
                #    exp_type="all-sp-loso"
                for run_obj in alg_runners:
                    out_file = "%s/%s%s%s.txt" % (run_obj.out_dir, exp_type,
                                                  run_obj.params_str,
                                                  kwargs.get("postfix", ""))
                    utils.checkDir(os.path.dirname(out_file))
                    eval_utils.evaluate_ground_truth(run_obj, eval_ann_obj,
                                                     out_file, **kwargs)

        if kwargs['cross_validation_folds'] is not None:
            # run cross validation
            cross_validation.run_cv_all_goterms(
                alg_runners,
                ann_obj,
                folds=kwargs['cross_validation_folds'],
                **kwargs)

        if kwargs['loso'] is True:
            # add the taxon file paths for this dataset to kwargs
            for arg in ['taxon_file', 'only_taxon_file']:
                kwargs[arg] = "%s/%s" % (input_dir, dataset[arg])
            # now run the leave-one-species-out eval
            eval_loso.eval_loso(alg_runners,
                                ann_obj,
                                eval_ann_obj=eval_ann_obj,
                                **kwargs)
Example no. 10
def run_and_eval_algs(run_obj,
                      ann_obj,
                      train_ann_mat,
                      test_ann_mat,
                      taxon=None,
                      **kwargs):
    goids, prots = ann_obj.goids, ann_obj.prots
    dag_matrix = ann_obj.dag_matrix
    params_results = defaultdict(int)

    if kwargs.get('keep_ann', False) is True:
        print("Keeping all annotations when making predictions")
    elif kwargs.get('non_pos_as_neg_eval', False) is True:
        print(
            "Evaluating using all non-ground-truth positives for the taxon as false positives"
        )
    else:
        print(
            "Evaluating using only the ground-truth negatives predicted as positives as false positives"
        )

    # change the annotation matrix to the current training positive examples
    curr_ann_obj = setup.Sparse_Annotations(dag_matrix, train_ann_mat, goids,
                                            prots)
    # make an ann obj with the test ann mat
    test_ann_obj = setup.Sparse_Annotations(dag_matrix, test_ann_mat, goids,
                                            prots)
    # if this is a gene based method, then run it on only the nodes which have a pos/neg annotation
    # unless specified otherwise by the "run_all_nodes" flag
    if (run_obj.get_alg_type() == 'gene-based'
            and not run_obj.kwargs.get("run_all_nodes")):
        # sum the boolean of the columns, then use nonzero to get the columns with a nonzero value
        run_obj.kwargs['nodes_to_run'] = (
            test_ann_mat != 0).sum(axis=0).nonzero()[1]
        print("\trunning %s using only the %d pos/neg nodes" %
              (run_obj.name, len(run_obj.kwargs['nodes_to_run'])))

    # setup the output file. Could be used by the runners to write temp files or other output files
    exp_type = "loso"
    postfix = kwargs.get("postfix", "")
    if kwargs['keep_ann']:
        exp_type = "eval-per-taxon"
    out_file = "%s/%s%s%s.txt" % (run_obj.out_dir, exp_type,
                                  run_obj.params_str, postfix)
    run_obj.out_pref = out_file.replace('.txt', '')
    utils.checkDir(os.path.dirname(out_file))

    # for sinksource_bounds, keep track of which nodes are either a left-out pos or left-out neg
    if run_obj.name in ['sinksource_bounds', 'sinksourceplus_bounds']:
        run_obj.params['rank_pos_neg'] = test_ann_mat

    # if predictions were already generated, and taxon is set to 'all', then use those.
    # otherwise, generate the prediction scores
    if kwargs['keep_ann'] and run_obj.goid_scores.getnnz() != 0:
        print("Using already computed scores")
    else:
        # replace the ann_obj in the runner with the current training annotations
        run_obj.ann_obj = curr_ann_obj
        #alg_runners = run_eval_algs.setup_runners([alg], alg_settings, curr_ann_obj, **kwargs)
        if kwargs.get('verbose'):
            utils.print_memory_usage()
        run_obj.setupInputs()
        if kwargs.get('verbose'):
            utils.print_memory_usage()
        run_obj.run()
        if kwargs.get('verbose'):
            utils.print_memory_usage()
        run_obj.setupOutputs(taxon=taxon)

    # now evaluate
    # this will write a file containing the fmax and other measures for each goterm
    # with the taxon name in the name of the file
    eval_utils.evaluate_ground_truth(
        run_obj,
        test_ann_obj,
        out_file,
        #non_pos_as_neg_eval=opts.non_pos_as_neg_eval,
        taxon=taxon,
        append=True,
        **kwargs)
    for key in run_obj.params_results:
        params_results[key] += run_obj.params_results[key]

    return params_results
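# A quick sketch of the nodes_to_run computation in run_and_eval_algs() above: find the
# columns (nodes) of the test annotation matrix that have at least one pos/neg annotation.
import numpy as np
from scipy import sparse

test_ann_mat = sparse.csr_matrix(np.array([[1, 0, -1, 0],
                                           [0, 0,  1, 0]]))
nodes_to_run = (test_ann_mat != 0).sum(axis=0).nonzero()[1]
print(nodes_to_run)   # columns 0 and 2 have a pos/neg annotation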
def weight_SWSN(ann_matrix,
                sparse_nets,
                net_names=None,
                out_file=None,
                nodes=None):
    """ TODO DOC
    """
    if len(sparse_nets) == 1:
        print("Only one network given to weight_SWSN. Nothing to do.")
        total_time = 0
        return sparse_nets[0], total_time
    # remove rows with 0 annotations/positives
    empty_rows = []
    for i in range(ann_matrix.shape[0]):
        pos, neg = alg_utils.get_goid_pos_neg(ann_matrix, i)
        # the combineWeightsSWSN method doesn't seem to
        # work if there's only 1 positive
        if len(pos) <= 1 or len(neg) <= 1:
            empty_rows.append(i)
    # don't modify the original annotation matrix to keep the rows matching the GO ids
    curr_ann_mat = delete_rows_csr(ann_matrix.tocsr(), empty_rows)

    # normalize the networks
    print("Normalizing the networks")
    normalized_nets = []
    for net in sparse_nets:
        normalized_nets.append(_net_normalize(net))
    print("Weighting networks for %d different GO terms" %
          (curr_ann_mat.shape[0]))
    print("Running simultaneous weights with specific negatives")
    start_time = time.process_time()
    alpha, indices = combineNetworksSWSN(curr_ann_mat, normalized_nets)
    if net_names is not None:
        print("\tnetworks chosen: %s" %
              (', '.join([net_names[i] for i in indices])))

    # now add the networks together with the alpha weight applied
    combined_network = alpha[0] * sparse_nets[indices[0]]
    for i in range(1, len(alpha)):
        combined_network += alpha[i] * sparse_nets[indices[i]]
    total_time = time.process_time() - start_time

    if out_file is not None:
        # replace the .txt if present
        out_file = out_file.replace('.txt', '.npz')
        utils.checkDir(os.path.dirname(out_file))
        print("\twriting combined network to %s" % (out_file))
        sparse.save_npz(out_file, combined_network)
        # also write the node ids so it's easier to access
        # TODO figure out a better way to store this
        node2idx_file = out_file + "-node-ids.txt"
        print("\twriting node ids to %s" % (node2idx_file))
        with open(node2idx_file, 'w') as out:
            out.write(''.join("%s\t%s\n" % (n, i)
                              for i, n in enumerate(nodes)))

        # write the alpha/weight of the networks as well
        net_weight_file = out_file + "-net-weights.txt"
        print("\twriting network weights to %s" % (net_weight_file))
        with open(net_weight_file, 'w') as out:
            out.write(''.join("%s\t%s\n" % (net_names[idx], str(alpha[i]))
                              for i, idx in enumerate(indices)))

    return combined_network, total_time
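# delete_rows_csr() is not shown in these examples. A minimal sketch of such a helper,
# assuming it simply drops the given row indices from a CSR matrix:
import numpy as np
from scipy import sparse

def delete_rows_csr_sketch(mat, rows_to_delete):
    """Return a copy of the CSR matrix *mat* without the rows in *rows_to_delete*."""
    keep = np.ones(mat.shape[0], dtype=bool)
    keep[list(rows_to_delete)] = False
    return mat[keep]

ann = sparse.csr_matrix(np.array([[1, -1], [0, 0], [1, 1]]))
print(delete_rows_csr_sketch(ann, [1]).toarray())   # rows 0 and 2 remain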