Example 1
def gen_random_network_data(seed_graph, n_tries=10):
    log = get_logger()
    random_graph, tries = safe_perturbed_graph(seed_graph, n_tries)
    nodes = sorted(random_graph.nodes())
    D = regularized_laplacian(random_graph, nodes, lam=0.05)
    C = rkhs_factor(D)
    return dict(G=random_graph, D=D, C=C, nodes=nodes), tries
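
A minimal usage sketch (assuming gen_random_network_data and the helpers it calls are importable from the surrounding module; the seed graph is just an illustrative built-in NetworkX graph):

import networkx as nx

# Any small connected graph serves as the seed; karate_club_graph is illustrative.
seed = nx.karate_club_graph()
data, tries = gen_random_network_data(seed, n_tries=10)
print(len(data['nodes']), 'nodes, generated in', tries, 'tries')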
Example 2
def main(args):
    log = get_logger()
    log.info('Loading edgelist from: %s', args.edgelist)
    seed_network = \
        util.simple_two_core(nx.read_edgelist(args.edgelist, encoding='ascii'),
                             verbose=False)

    log.info('Generating network...')
    t_start = time()
    data, tries = gen_random_network_data(seed_network, args.n_tries)
    t_end = time()
    elapsed = t_end - t_start
    log.info('Random network generated in %.2f seconds (%d tries)', elapsed, tries)

    log.info('Saving random network to: %s', args.output)
    joblib.dump(data, args.output)
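
The saved dictionary can later be reloaded with joblib (a sketch; the path is illustrative and stands in for whatever was passed as args.output):

import joblib

# Reload the dictionary written above; keys follow gen_random_network_data.
data = joblib.load('random_network.pkl')  # illustrative path
G, D, C, nodes = data['G'], data['D'], data['C'], data['nodes']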
Example 3
def run(args):
    # Load the homologs
    geneAToHomologs, geneBToHomologs = load_homolog_mapping_file(
        args.homolog_mapping_file)

    # Load the genetic interactions
    A_GI, A_SL, A = load_genetic_interactions_table(args.species_A_table_file)
    B_GI, B_SL, B = load_genetic_interactions_table(args.species_B_table_file)

    logger = get_logger()
    if args.verbose > 0:
        logger.info('* Loaded genetic interactions...')
        logger.info('\t- %s interactions in species A (%s SLs)' %
                    (len(A_GI), len(A)))
        logger.info('\t- %s interactions in species B (%s SLs)' %
                    (len(B_GI), len(B)))

    # Make the predictions. We predict an SL between (u, v) in species B iff
    # a homologous pair in species A is reported as SL.
    B_edges, B_true, B_pred = predict_sl_from_homolog_mapping(
        (A_GI, A_SL, A), (B_GI, B_SL, B), geneBToHomologs, args.verbose)

    # Compute PR and AUC
    if args.verbose > 0:
        logger.info('* Evaluating results...')
        logger.info('\tPrecision: %.3f' % precision_score(B_true, B_pred))
        logger.info('\tRecall: %.3f' % recall_score(B_true, B_pred))
        logger.info('\tFPR: %.3f' % compute_fpr(B_true, B_pred))

    ################################################################################
    # OUTPUT
    ################################################################################
    # Output predictions
    valToName = {1: "SL", 0: "Non-SL"}
    items = [{
        "Gene A": u,
        "Gene B": v,
        "Ground truth": valToName[gt],
        "Predicted": valToName[p]
    } for (u, v), gt, p in zip(B_edges, B_true, B_pred)]
    df = pd.DataFrame(items)
    df.to_csv(args.output_file, sep='\t', index=False)
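
compute_fpr is defined elsewhere in the project; below is a minimal sketch of a false positive rate computation consistent with the precision/recall calls above (assuming binary 0/1 labels; the sketch's name is hypothetical):

from sklearn.metrics import confusion_matrix

def compute_fpr_sketch(y_true, y_pred):
    # FPR = FP / (FP + TN), computed from the binary confusion matrix.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return fp / (fp + tn) if (fp + tn) > 0 else 0.0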
Example 4
def simplify_graph(G, verbose=True):
    '''
    Returns the simple/strict graph corresponding to the given graph
    (removes self-loops from G and returns the largest connected component)
    '''
    logger = get_logger()
    if (not nx.is_connected(G)):
        cc_list = list(nx.connected_component_subgraphs(G))
        cc_sizes = [len(x) for x in cc_list]
        largest_cc = max(cc_sizes)
        cc_sizes.remove(largest_cc)
        if verbose:
            logger.warning('Network has %d connected components', len(cc_list))
            logger.warning(
                '\tLargest is size %d and all the rest are %d or smaller',
                largest_cc, max(cc_sizes))
            logger.warning('\tUsing largest connected component')

        G = max(cc_list, key=len)
    G.remove_edges_from(G.selfloop_edges())
    return G
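
Note that connected_component_subgraphs and Graph.selfloop_edges were removed in NetworkX 2.4, so simplify_graph as written targets older releases. A small equivalent sketch against the current NetworkX API, on a toy graph with a self-loop and a stray component:

import networkx as nx

# Toy graph: a triangle with a self-loop, plus a disconnected edge.
H = nx.Graph([(1, 2), (2, 3), (1, 3), (1, 1), (4, 5)])

# Keep the largest connected component and drop self-loops (NetworkX >= 2.4 API).
largest = max(nx.connected_components(H), key=len)
H_simple = H.subgraph(largest).copy()
H_simple.remove_edges_from(nx.selfloop_edges(H_simple))
print(H_simple.number_of_nodes(), H_simple.number_of_edges())  # 3 nodes, 3 edges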
Example 5
def simple_two_core(G, verbose=True):
    ''' Returns the simple 2-core of the given graph '''

    logger = get_logger()
    # Get simple graph
    G = simplify_graph(G, verbose)
    # Compute 2 core
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    if verbose:
        logger.info('PPI info - # Nodes: %d, # Edges: %d', num_nodes,
                    num_edges)
        logger.info('Computing 2 core')
    G = nx.k_core(G, 2)
    num_2core_nodes = G.number_of_nodes()
    num_2core_edges = G.number_of_edges()
    if verbose:
        logger.info('2 core info - # Nodes: %d, # Edges: %d', num_2core_nodes,
                    num_2core_edges)
        logger.info('2 core removed %d nodes and %d edges',
                    num_nodes - num_2core_nodes, num_edges - num_2core_edges)
    return G
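
A quick sketch of what the 2-core step does, using nx.k_core directly as simple_two_core does: degree-1 nodes (and anything that drops to degree 1 after their removal) are stripped.

import networkx as nx

# A 4-cycle with one pendant node; the pendant has degree 1 and is removed by the 2-core.
H = nx.cycle_graph(4)
H.add_edge(3, 4)
core = nx.k_core(H, 2)
print(sorted(core.nodes()))  # [0, 1, 2, 3]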
Example 6
def load_genetic_interactions_table(gi_table_file,
                                    ppi_file,
                                    sample_negs=False):
    logger = get_logger()
    GI = pd.read_csv(gi_table_file, sep='\t')

    # Explicitly filter out inconclusives...
    GI = GI[(GI['Category'] == 'SL') | (GI['Category'] == 'Non-SL')]
    SL = GI.loc[GI['Category'] == 'SL']
    pairs = set(frozenset([u, v]) for u, v in zip(SL['Gene A'], SL['Gene B']))

    # If sample negatives...
    if sample_negs:
        assert (len(GI) == len(SL))

        # Sorted list (not a set) so random.sample accepts it on Python >= 3.11
        nodes = sorted(
            munk.util.simple_two_core(nx.read_edgelist(ppi_file)).nodes())
        non_sl_pairs = set()
        non_gi_data = []
        while len(non_sl_pairs) != len(pairs):
            p = frozenset(random.sample(nodes, 2))
            if p not in pairs and p not in non_sl_pairs:
                _p = tuple(p)
                non_sl_pairs.add(p)
                non_gi_data.append({
                    'Category': 'Non-SL',
                    'Score': 1,
                    'Gene A': _p[0],
                    'Gene B': _p[1]
                })

        pairs = pairs | non_sl_pairs
        logger.info('\tAdding %s non-SL pairs for %s total interactions',
                    len(non_sl_pairs), len(pairs))
        GI = pd.concat([GI, pd.DataFrame(non_gi_data)])
        SL = GI.loc[GI['Category'] == 'SL']

    return GI, SL, pairs
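
The loader expects a tab-separated table with at least 'Gene A', 'Gene B', 'Category' ('SL' / 'Non-SL'; other values are treated as inconclusive) and 'Score' columns. A sketch of writing a minimal input with pandas (gene names and path are illustrative):

import pandas as pd

# Minimal GI table with the columns the loader filters on.
gi = pd.DataFrame([
    {'Gene A': 'geneA1', 'Gene B': 'geneB1', 'Category': 'SL', 'Score': 1.0},
    {'Gene A': 'geneA2', 'Gene B': 'geneB2', 'Category': 'Non-SL', 'Score': 0.0},
])
gi.to_csv('gi_table.tsv', sep='\t', index=False)  # illustrative path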
Example 7
def run(args):
    # Create logger
    logger = get_logger(args.verbosity)

    # Load the SLs
    logger.info('[Loading genetic interactions file]')
    df = pd.read_csv(args.gi_file,
                     sep='\t',
                     dtype={
                         'Gene A': str,
                         'Gene B': str,
                         'Score': float
                     })
    pairToOutcome = dict(
        (frozenset([u, v]), c)
        for u, v, c in zip(df['Gene A'], df['Gene B'], df['Category'])
        if c != INCONCLUSIVE)

    if args.sinatra_featurize:
        # Load only SLs for SINaTRA features
        pairs = sorted(p for p, c in pairToOutcome.items() if c == SL)
        logger.info('- SINaTRA features: loaded %s SLs' % len(pairs))
    else:
        # We include SLs and non-SLs
        pairs = sorted(p for p, c in pairToOutcome.items())

        # Do some light reporting
        class_counter = Counter(pairToOutcome.values())
        n_sl = class_counter[SL]
        n_non_sl = class_counter[NON_SL]
        logger.info('- Loaded %s interactions (%s SL and %s non-SL)' %
                    (len(pairs), n_sl, n_non_sl))

    # Load the embedding
    logger.info('[Loading MUNK embedding]')
    obj = joblib.load(args.embedding_file)
    embedding = obj.get('X')
    nodes = obj.get('nodes')
    landmark_nodes = set(obj.get('landmarks'))
    node_set = set(nodes)
    n_nodes = len(nodes)
    nodeToIndex = dict(zip(nodes, range(n_nodes)))
    logger.info('- %s nodes' % n_nodes)
    logger.info('- %s landmarks' % len(landmark_nodes))

    # Restrict to pairs also in the network
    pairs = [
        p for p in pairs if all([u in node_set for u in p]) and len(p) == 2
    ]
    logger.info('- Restricting to %s pairs in the network' % len(pairs))

    if args.remove_landmarks:
        pairs = [p for p in pairs if all([u not in landmark_nodes for u in p])]
        logger.info(
            '- Restricting to %s pairs in the network without landmarks' %
            len(pairs))

    if args.sinatra_featurize:
        # For SINaTRA features, we randomly sample a set of
        # non-SLs from nodes in the network
        import random
        sl_pairs = list(pairs)
        non_sl_pairs = set()
        while len(non_sl_pairs) != len(sl_pairs):
            if args.remove_landmarks:
                # sorted() so random.sample gets a sequence (sets are rejected on Python >= 3.11)
                p = frozenset(random.sample(sorted(node_set - landmark_nodes), 2))
            else:
                p = frozenset(random.sample(nodes, 2))
            if p not in sl_pairs and p not in non_sl_pairs:
                non_sl_pairs.add(p)

        pairToOutcome.update((p, NON_SL) for p in non_sl_pairs)
        pairs = sl_pairs + list(non_sl_pairs)

        logger.info('\tAdding %s non-SL pairs for %s total interactions' %
                    (len(non_sl_pairs), len(pairs)))

    # Determine the feature function
    if args.feature_function == ADD_FEATURES:

        def feature_function(p):
            u, v = sorted(p)
            return embedding[nodeToIndex[u]] + embedding[nodeToIndex[v]]

    elif args.feature_function == MEAN_FEATURES:

        def feature_function(p):
            u, v = sorted(p)
            return (embedding[nodeToIndex[u]] + embedding[nodeToIndex[v]]) / 2.
    else:
        raise NotImplementedError('Feature function "%s" not implemented' %
                                  args.feature_function)

    # Construct the features and outcomes and output to file
    n_pairs = len(pairs)
    X = np.array([feature_function(p) for p in pairs])
    y = np.array([pairToOutcome[p] == SL for p in pairs])

    # Output to file
    output = dict(X=X, y=y, pairs=pairs, params=vars(args))
    joblib.dump(output, args.output_file)
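
The two feature functions simply combine the MUNK embedding rows of a gene pair (element-wise sum or mean); a self-contained sketch with a toy embedding and hypothetical node names:

import numpy as np

# Toy embedding: 3 nodes, 4 dimensions (values illustrative).
nodes = ['g1', 'g2', 'g3']
embedding = np.arange(12, dtype=float).reshape(3, 4)
node_to_index = {u: i for i, u in enumerate(nodes)}

def add_features(pair):
    u, v = sorted(pair)
    return embedding[node_to_index[u]] + embedding[node_to_index[v]]

def mean_features(pair):
    u, v = sorted(pair)
    return (embedding[node_to_index[u]] + embedding[node_to_index[v]]) / 2.0

X = np.array([add_features(frozenset(p)) for p in [('g1', 'g2'), ('g2', 'g3')]])
print(X.shape)  # (2, 4)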
Example 8
def main(args):
    log = get_logger()
    source_data_files = args.source_data_files
    target_data_files = args.target_data_files
    assert(len(source_data_files) == len(target_data_files))

    log.info('Loading homologs list from %s', args.homolog_list)
    raw_homologs = util.read_homolog_list(args.homolog_list)

    log.info('Loading source edgelist from %s', args.source_edgelist)
    source_G = nx.read_edgelist(args.source_edgelist, encoding='ascii')
    log.info('Loading target edgelist from %s', args.target_edgelist)
    target_G = nx.read_edgelist(args.target_edgelist, encoding='ascii')

    log.info('Computing MUNK embeddings for real PPI networks with %d landmarks', args.n_landmarks)
    n_landmarks = args.n_landmarks

    source_G = util.simple_two_core(source_G)
    target_G = util.simple_two_core(target_G)
    homologs = util.homologs_in_graphs(source_G, target_G, raw_homologs)

    source_nodes = sorted(source_G.nodes())
    target_nodes = sorted(target_G.nodes())

    source_D = munk.regularized_laplacian(source_G, source_nodes, args.lam)
    target_D = munk.regularized_laplacian(target_G, target_nodes, args.lam)
    source_C = munk.rkhs_factor(source_D)

    source_data = dict(C=source_C, nodes=source_nodes)
    target_data = dict(D=target_D, nodes=target_nodes)

    real_diff, other_mean, hom_mean = difference_in_means(
        source_data, target_data, homologs, n_landmarks, source_nodes,
        target_nodes)
    log.info('Difference between homolog and non-homolog similarity scores %f', real_diff)

    n_permutations = len(source_data_files)
    log.info('Loading %d pairs of graphs from disk and computing differences in means...', n_permutations)
    means = Parallel(n_jobs=args.n_jobs)(
                delayed(difference_in_means_from_files)
                       (source_data_fp,
                        target_data_fp,
                        homologs,
                        n_landmarks,
                        source_nodes,
                        target_nodes)
                for source_data_fp, target_data_fp in
                    zip(source_data_files, target_data_files))
    errs = [1 for m in means if m is None]
    log.warning('Errors (skipped permutations): %d', len(errs))
    means = [m for m in means if m is not None]

    rand_G_mean_diffs, rand_G_other_means, rand_G_hom_means = zip(*means)
    log.info('Mean difference in means between homologs and non-homologs scores for random graphs %f', np.mean(rand_G_mean_diffs))
    p_val, n_less_than, n_observations = one_tail_pval(real_diff, rand_G_mean_diffs)
    log.info('N permutations less than real: %d, N permutations %d', n_less_than, n_observations)
    e_size = effect_size(real_diff, rand_G_mean_diffs)

    log.info('P-value: %f', p_val)
    log.info('Effect-size: %f', e_size)
    log.info('# permutations: %d', n_permutations)
    results = dict(pval=p_val, effect_size=e_size, n_permutations=n_observations, n_less_than=n_less_than, errs=len(errs))

    log.info('Writing results to %s', args.output_file)
    with open(args.output_file, 'w') as OUT:
        json.dump(results, OUT, indent=2)
    diffs = dict( real=dict(mean_diff=real_diff,
                            non_hom_mean=other_mean,
                            hom_mean=hom_mean),
                  random=dict(mean_diffs=rand_G_mean_diffs,
                              non_hom_means=rand_G_other_means,
                              hom_means=rand_G_hom_means))
    log.info('Writing values of differences to %s', args.diffs_output_file)
    joblib.dump(diffs, args.diffs_output_file)
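
one_tail_pval and effect_size come from the surrounding package; below is a minimal sketch of how a one-tailed empirical p-value over the permutation statistics could be computed (not necessarily the package's exact definition):

import numpy as np

def one_tail_pval_sketch(observed, permuted):
    # Fraction of permutation statistics at least as large as the observed
    # difference in means, with a +1 correction so the p-value is never zero.
    permuted = np.asarray(permuted)
    n_ge = int(np.sum(permuted >= observed))
    return (n_ge + 1) / (len(permuted) + 1)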
Example 9
def run(args):
    # Set up logging
    logger = get_logger(args.verbose)
    logger.info('* Loading input files...')
    
    # Load the homologs
    logger.info('\t- Loading homolog mapping...')
        
    geneAToHomologs, geneBToHomologs = \
        load_homolog_mapping_file(args.homolog_mapping_file)

    # Load the genetic interactions
    logger.info('\t- Loading genetic interaction files...')

    if args.sample_negs:
        logger.info('\t- Sampling Non-SLs...')
        
    As, Bs = [], []
    for species_A_file, species_A_ppi in \
        zip(args.species_A_files, args.species_A_ppis):
        As.append(load_genetic_interactions_table(
                    species_A_file, species_A_ppi, args.sample_negs))
    for species_B_file, species_B_ppi in \
        zip(args.species_B_files, args.species_B_ppis):
        Bs.append(load_genetic_interactions_table(
                    species_B_file, species_B_ppi, args.sample_negs))
    
    # Make predictions
    logger.info('* Making predictions for each pair of datasets...')
        
    items = []
    for (A_name, (A_GI, A_SL, A)), (B_name, (B_GI, B_SL, B)) in \
        product(zip(args.species_A_names, As), zip(args.species_B_names, Bs)):
        # Simple progress
        if args.verbose > 0:
            logger.info('\t- %s vs. %s' % (A_name, B_name))
            
        # Predict B from A
        B_edges, B_true, B_pred = predict((A_GI, A_SL, A),
                                          (B_GI, B_SL, B),
                                          geneBToHomologs,
                                          args.verbose)
        items.append({
            "Dataset A (Species)": '%s (%s)' % (A_name, args.species_names[0]),
            "Dataset B (Species)": '%s (%s)' % (B_name, args.species_names[1]),
            "Precision (True Positive Rate)": precision_score(B_true, B_pred),
            "Recall": recall_score(B_true, B_pred),
            "False Positive Rate": compute_fpr(B_true, B_pred),
            "F1 Score": f1_score(B_true, B_pred),
        })
        
        # Predict A from B
        A_edges, A_true, A_pred = predict((B_GI, B_SL, B),
                                          (A_GI, A_SL, A),
                                          geneAToHomologs,
                                          args.verbose)
        items.append({
            "Dataset A (Species)": '%s (%s)' % (B_name, args.species_names[1]),
            "Dataset B (Species)": '%s (%s)' % (A_name, args.species_names[0]),
            "Precision (True Positive Rate)": precision_score(A_true, A_pred),
            "Recall": recall_score(A_true, A_pred),
            "False Positive Rate": compute_fpr(A_true, A_pred),
            "F1 Score": f1_score(A_true, A_pred),
        })

    # Output to file
    logger.info('* Outputting to file...')
        
    df = pd.DataFrame(items)[['Dataset A (Species)', 'Dataset B (Species)',
                              'Precision', 'Recall',
                              'False Positive Rate', 'F1 Score']]
    df.to_csv(args.output_file, index=False, sep='\t')
Example 10
# Add classifier choices with subparsers
subparser = parser.add_subparsers(dest='classifier', help='Classifier')
rf_parser = subparser.add_parser('rf')
rf_parser.add_argument('-md', '--max_depth', type=int, default=None, required=False)
rf_parser.add_argument('-nt', '--n_trees', type=int, nargs='*', required=False,
                       default=[10, 100, 250, 500])

svm_parser = subparser.add_parser('svm')
svm_parser.add_argument('-sc', '--svm_Cs', type=float, required=False, nargs='*',
                        default=[0.01, 0.1, 1, 10, 100, 1000, 10000])
svm_parser.add_argument('-st', '--svm_tolerance', type=float, default=1e-3, required=False)

args = parser.parse_args(sys.argv[1:])

# Set up logger
logger = get_logger(args.verbosity)

################################################################################
# LOAD INPUT DATA
################################################################################
# Load features
logger.info('[Loading features and genetic interactions]')
a_data = joblib.load(args.feature_files[0])
b_data = joblib.load(args.feature_files[1])


X_A, y_A = np.array(a_data.get('X')), a_data.get('y')
A_pairs, A_name = np.asarray(a_data.get('pairs')), args.names[0]
X_B, y_B = np.array(b_data.get('X')), b_data.get('y')
B_pairs, B_name = np.asarray(b_data.get('pairs')), args.names[1]
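
For reference, the subparser pattern above selects the classifier and its hyperparameter grid from the command line; a standalone sketch of the same pattern (parser and defaults abbreviated):

import argparse

# Standalone sketch of the classifier subparsers above.
p = argparse.ArgumentParser()
sub = p.add_subparsers(dest='classifier')
rf = sub.add_parser('rf')
rf.add_argument('-nt', '--n_trees', type=int, nargs='*', default=[10, 100, 250, 500])
svm = sub.add_parser('svm')
svm.add_argument('-sc', '--svm_Cs', type=float, nargs='*', default=[0.01, 0.1, 1.0])

print(p.parse_args(['rf', '-nt', '100', '500']))  # Namespace(classifier='rf', n_trees=[100, 500])
print(p.parse_args(['svm']))                      # falls back to the default C grid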