Example #1
0
def test_real_graph(nparts):
    """Cluster the real author collaboration graph into at most `nparts` parts.

    Loads the author collab graph and the full author-product graph from
    disk, keeps only authors that carry all three review features, builds an
    author -> (products rated >= 4 stars) mapping, runs hard EM clustering,
    and writes the cluster labels back out as a gexf file.

    nparts -- number of partitions requested from HardEM.run_EM.
    """
    logging.info('Reading author collab graph')
    # NOTE(review): the path ends in .gexf but is read with read_graphml --
    # confirm the file on disk really is GraphML (nx.read_gexf otherwise).
    author_graph = nx.read_graphml('/home/amir/az/io/spam/mgraph2.gexf')
    author_graph.name = 'author graph'
    logging.info('Reading the full author product graph')
    full_graph = nx.read_graphml('/home/amir/az/io/spam/spam_graph.graphml')
    full_graph.name = 'full graph'

    # Keep only authors whose nodes carry all three features HardEM needs.
    proper_author_graph = author_graph.subgraph([a for a in author_graph if 'revLen' in author_graph.node[a]
                                                                            and 'hlpful_fav_unfav' in author_graph.node[a]
                                                                            and 'vrf_prchs_fav_unfav' in author_graph.node[a]])
    # features = {'revLen': 0.0, 'hlpful_fav_unfav': False, 'vrf_prchs_fav_unfav': False}
    # for a in author_graph:
    #     for feat, def_val in features.items():
    #         if feat not in author_graph.node[a]:
    #             author_graph.node[a][feat] = def_val

    # sub sample proper_author_graph
    # proper_author_graph.remove_edges_from(random.sample(proper_author_graph.edges(), 2*proper_author_graph.size()/3))
    # degree = proper_author_graph.degree()
    # proper_author_graph.remove_nodes_from([n for n in proper_author_graph if degree[n] == 0])
    # author to the product reviewed by him mapping
    logging.debug('forming the product mapping')
    # Map every remaining author to the products he rated 4 stars or higher.
    author_product_mapping = {}
    for a in proper_author_graph:
        author_product_mapping[a] = [p for p in full_graph[a] if 'starRating' in full_graph[a][p] and
                                                                 full_graph[a][p]['starRating'] >= 4]
    logging.debug('Running EM')
    ll, partition = HardEM.run_EM(proper_author_graph, author_product_mapping, nparts=nparts, parallel=True)
    print 'best loglikelihood: %s' % ll
    # Store each clustered node's label on the original (unfiltered) graph.
    for n in partition:
        author_graph.node[n]['cLabel'] = int(partition[n])
    nx.write_gexf(author_graph, '/home/amir/az/io/spam/spam_graph_mgraph_sage_labeled.gexf')
def test_hard_EM(N, nparts, write_labeled_graph=True, parallel=True):
    graph, author_prod_map = gen_test_graph(N)
    ll, partition = HardEM.run_EM(author_graph=graph, author_product_map=author_prod_map, nparts=nparts, parallel=parallel)

    print 'best loglikelihood: %s' % ll
    print partition.values()
    for n in partition:
        graph.node[n]['cLabel'] = int(partition[n])
    if write_labeled_graph:
        nx.write_graphml(graph, '/home/amir/az/io/spam/synthetic_graph_sage_labeled.graphml')
Example #3
0
def test_hard_EM(N, nparts, write_labeled_graph=True, parallel=True):
    graph, author_prod_map, _ = gen_synthetic_graph(N, nparts)
    ll, partition = HardEM.run_EM(author_graph=graph, author_product_map=author_prod_map, nparts=nparts, parallel=parallel)

    print 'best loglikelihood: %s' % ll
    print partition.values()
    for n in partition:
        graph.node[n]['cLabel'] = int(partition[n])
    if write_labeled_graph:
        nx.write_graphml(graph, '/home/amir/amazon-spam-review/io/synthetic_graph_labeled.graphml')
    return graph
Example #4
0
def test_real_graph(nparts):
    """Cluster the augmented real author graph into at most `nparts` parts.

    Loads the augmented author collab graph and the full author-product
    graph, drops authors that lack any of the three required features,
    keeps only nodes in sufficiently large connected components, maps each
    author to the products he rated at least 4 stars, runs hard EM and
    saves the labeled graph as gexf.

    nparts -- number of partitions requested from HardEM.run_EM.
    """
    MIN_CC_SIZE = 10  # Nodes belonging to connected components smaller than this are discarded
    logging.info('Reading author collab graph')
    author_graph = nx.read_graphml(
        '/home/amir/az/io/spam/spam_mgraph_augmented.graphml')
    author_graph.name = 'author graph'
    logging.info('Reading the full author product graph')
    full_graph = nx.read_graphml('/home/amir/az/io/spam/spam_graph.graphml')
    full_graph.name = 'full graph'

    logging.info('Removing nodes which do not have all the features')
    proper_author_graph = author_graph.subgraph([
        a for a in author_graph
        if 'revLen' in author_graph.node[a] and 'hlpful_fav_unfav' in
        author_graph.node[a] and 'vrf_prchs_fav_unfav' in author_graph.node[a]
    ])
    logging.info(
        'Keeping only nodes which belong to large connected components')
    ccs = nx.connected_components(proper_author_graph)
    # Python 2 filter returns a list of the components that survive the cut.
    ccs = filter(lambda cc: len(cc) >= MIN_CC_SIZE, ccs)
    proper_author_graph = proper_author_graph.subgraph(itertools.chain(*ccs))
    # features = {'revLen': 0.0, 'hlpful_fav_unfav': False, 'vrf_prchs_fav_unfav': False}
    # for a in author_graph:
    #     for feat, def_val in features.items():
    #         if feat not in author_graph.node[a]:
    #             author_graph.node[a][feat] = def_val

    # sub sample proper_author_graph
    # proper_author_graph.remove_edges_from(random.sample(proper_author_graph.edges(), 2*proper_author_graph.size()/3))
    # degree = proper_author_graph.degree()
    # proper_author_graph.remove_nodes_from([n for n in proper_author_graph if degree[n] == 0])
    # author to the product reviewed by him mapping
    logging.debug('forming the product mapping')
    # Map every remaining author to the products he rated 4 stars or higher.
    author_product_mapping = {}
    for a in proper_author_graph:
        author_product_mapping[a] = [
            p for p in full_graph[a] if 'starRating' in full_graph[a][p]
            and full_graph[a][p]['starRating'] >= 4
        ]
    logging.info('Running EM')
    ll, partition = HardEM.run_EM(proper_author_graph,
                                  author_product_mapping,
                                  nparts=nparts,
                                  parallel=True)
    print 'best loglikelihood: %s' % ll
    # Store each clustered node's label on the original (unfiltered) graph.
    for n in partition:
        author_graph.node[n]['cLabel'] = int(partition[n])
    output_filename = 'spam_graph_mgraph_labeled.gexf'
    logging.info(
        'Writing the clusters into the graph and saving the file into %s' %
        output_filename)
    nx.write_gexf(author_graph, '/home/amir/az/io/spam/%s' % output_filename)
def test_real_graph(nparts):
    MIN_CC_SIZE = 10        # Nodes belonging to connected components smaller than this are discarded
    logging.info('Reading author collab graph')
    author_graph = nx.read_graphml('/home/amir/az/io/spam/spam_mgraph_augmented.graphml')
    author_graph.name = 'author graph'
    logging.info('Reading the full author product graph')
    full_graph = nx.read_graphml('/home/amir/az/io/spam/spam_graph.graphml')
    full_graph.name = 'full graph'

    logging.info('Removing nodes which do not have all the features')
    proper_author_graph = author_graph.subgraph([a for a in author_graph if 'revLen' in author_graph.node[a]
                                                and 'hlpful_fav_unfav' in author_graph.node[a]
                                                and 'vrf_prchs_fav_unfav' in author_graph.node[a]])
    logging.info('Keeping only nodes which belong to large connected components')
    ccs = nx.connected_components(proper_author_graph)
    ccs = filter(lambda cc: len(cc) >= MIN_CC_SIZE, ccs)
    proper_author_graph = proper_author_graph.subgraph(itertools.chain(*ccs))
    # features = {'revLen': 0.0, 'hlpful_fav_unfav': False, 'vrf_prchs_fav_unfav': False}
    # for a in author_graph:
    #     for feat, def_val in features.items():
    #         if feat not in author_graph.node[a]:
    #             author_graph.node[a][feat] = def_val

    # sub sample proper_author_graph
    # proper_author_graph.remove_edges_from(random.sample(proper_author_graph.edges(), 2*proper_author_graph.size()/3))
    # degree = proper_author_graph.degree()
    # proper_author_graph.remove_nodes_from([n for n in proper_author_graph if degree[n] == 0])
    # author to the product reviewed by him mapping
    logging.debug('forming the product mapping')
    author_product_mapping = {}
    for a in proper_author_graph:
        author_product_mapping[a] = [p for p in full_graph[a] if 'starRating' in full_graph[a][p] and
                                                                 full_graph[a][p]['starRating'] >= 4]
    logging.info('Running EM')
    ll, partition = HardEM.run_EM(proper_author_graph, author_product_mapping, nparts=nparts, parallel=True)
    print 'best loglikelihood: %s' % ll
    for n in partition:
        author_graph.node[n]['cLabel'] = int(partition[n])
    output_filename = 'spam_graph_mgraph_labeled.gexf'
    logging.info('Writing the clusters into the graph and saving the file into %s'%output_filename)
    nx.write_gexf(author_graph, '/home/amir/az/io/spam/%s'%output_filename)
Example #6
0
def exhaustive_ll(N, nparts, parallel=True):
    """Brute-force the EM log-likelihood over every possible partitioning.

    Builds a synthetic graph with `nparts` true clusters and roughly N
    nodes, then evaluates em_ll_map on every partitioning into at most
    `nparts` parts (optionally across a multiprocessing Pool).

    Returns (v, cluster_sizes, graph) where v is the list of em_ll_map
    results, one per candidate partitioning.
    """
    # Stored as module-level globals so forked Pool workers running
    # em_ll_map see the graph/map/reference without per-task pickling.
    global ex_ll_graph, ex_ll_nparts, ex_ll_author_prod_map, ex_ll_ref_prt
    ex_ll_graph, ex_ll_author_prod_map, cluster_sizes = gen_synthetic_graph(N, nparts)
    N = sum(cluster_sizes)      # sum of cluster sizes is close to N but does not always match
    ex_ll_nparts = nparts
    ex_ll_graph, ex_ll_author_prod_map = HardEM._preprocess_graph_and_map(ex_ll_graph, ex_ll_author_prod_map)
    # reference partitioning: consecutive runs of cluster labels, e.g.
    # cluster_sizes (2, 3) -> (0, 0, 1, 1, 1)
    ex_ll_ref_prt = []
    for i in range(len(cluster_sizes)):
        ex_ll_ref_prt.extend([i]*cluster_sizes[i])
    ex_ll_ref_prt = tuple(ex_ll_ref_prt)
    # all possible partitioning of at most `nparts` partitions
    partitions = itertools.chain(*[gen_partition(N, nparts_i) for nparts_i in range(1, nparts + 1)])
    # Total count is the sum of Stirling numbers of the second kind.
    logging.info('Processing %d partitions' % sum(stirling2(N, nparts_i) for nparts_i in range(1, nparts + 1)))
    if parallel:
        p = Pool()
        v = p.imap(em_ll_map, partitions)
        p.close(); p.join()
    else:
        v = itertools.imap(em_ll_map, partitions)
    v = list(v)     # since v is a generator, keeps them in a list so reading from it won't consume it
    # find the logl for the presumed correct partitioning
    ref_ll = 0
    for vv in v:
        if vv[0] == ex_ll_ref_prt:
            ref_ll = vv[1]
            break
    else:
        # for/else: this branch runs only when the loop was never broken,
        # i.e. no result matched the reference partitioning.
        logging.error('The correct partitioning was not found')
    # keep only one from set of permutations with the same loglikelihood
    # v_dict = {ll: prt for prt, ll in v}
    # v = v_dict.items()
    # v.sort(key=lambda tup: tup[0], reverse=True)
    # for i in range(0, min(10, len(v))):
    #     print '#%d\t%s' % (i, v[i])
    # print '##\t%s' % ((ref_ll, ex_ll_ref_prt),)
    return v, cluster_sizes, ex_ll_graph
Example #7
0
# Script: run hard EM clustering on a crawled review graph and write the
# cluster-labeled member graph back to disk as gexf.
import logging

logging.basicConfig(level=logging.DEBUG, format='%(process)d\t%(asctime)s:%(levelname)s: %(message)s', datefmt='%H:%M:%S')

from pre_process import crawl_to_graph

# Directory holding the crawled dataset and the unlabeled member graph.
DS_DIR = '/home/amir/pyproj/amazon-review-spam/io/same_cat_v2'

graph, membs, prods = crawl_to_graph(ds_dir=DS_DIR)
graph_orig = graph.copy()  # keep an untouched copy of the crawled graph

import networkx as nx
from os import path
mgraph = nx.read_gexf(path.join(DS_DIR, '%s.gexf' % 'em_unlabeled_mgraph'))


# Map every author in the member graph to all of his neighbors (products)
# in the full crawled graph.
author_product_mapping = {}
for a in mgraph:
    author_product_mapping[a] = [p for p in graph[a]]


from hardEM_gurobi import HardEM

# NOTE(review): run_EM is invoked with nparts*5, not nparts -- confirm the
# 5x over-clustering is intentional.
nparts = 4
ll, partition = HardEM.run_EM(author_graph=mgraph, author_product_map=author_product_mapping, nparts=nparts*5, parallel=True, nprocs=4)

# Store each author's cluster id on the member graph.
for a in mgraph:
    mgraph.node[a]['cLabel'] = int(partition[a])


nx.write_gexf(mgraph, path.join(DS_DIR, '%s.gexf' % 'em_labeled_mgraph'), version='1.2draft', encoding='us-ascii')
Example #8
0
def em_ll_map(prt):
    """Pool worker: score one candidate partitioning.

    Returns (prt, log-likelihood). The ex_ll_* globals are prepared by
    exhaustive_ll before the worker pool is started.
    """
    scorer = HardEM(author_graph=ex_ll_graph, author_product_map=ex_ll_author_prod_map,
                    nparts=ex_ll_nparts, init_partition=prt)
    return prt, scorer.log_likelihood()
Example #9
0
def em_ll_map(prt):
    """Pool worker: score one candidate partitioning.

    Returns (prt, log-likelihood, Rand index of prt against the reference
    partitioning). The ex_ll_* globals are prepared by exhaustive_ll.
    """
    scorer = HardEM(author_graph=ex_ll_graph,
                    author_product_map=ex_ll_author_prod_map,
                    nparts=ex_ll_nparts,
                    init_partition=prt)
    return prt, scorer.log_likelihood(), rand_index(prt, ex_ll_ref_prt)