def test_real_graph(nparts): logging.info('Reading author collab graph') author_graph = nx.read_graphml('/home/amir/az/io/spam/mgraph2.gexf') author_graph.name = 'author graph' logging.info('Reading the full author product graph') full_graph = nx.read_graphml('/home/amir/az/io/spam/spam_graph.graphml') full_graph.name = 'full graph' proper_author_graph = author_graph.subgraph([a for a in author_graph if 'revLen' in author_graph.node[a] and 'hlpful_fav_unfav' in author_graph.node[a] and 'vrf_prchs_fav_unfav' in author_graph.node[a]]) # features = {'revLen': 0.0, 'hlpful_fav_unfav': False, 'vrf_prchs_fav_unfav': False} # for a in author_graph: # for feat, def_val in features.items(): # if feat not in author_graph.node[a]: # author_graph.node[a][feat] = def_val # sub sample proper_author_graph # proper_author_graph.remove_edges_from(random.sample(proper_author_graph.edges(), 2*proper_author_graph.size()/3)) # degree = proper_author_graph.degree() # proper_author_graph.remove_nodes_from([n for n in proper_author_graph if degree[n] == 0]) # author to the product reviewed by him mapping logging.debug('forming the product mapping') author_product_mapping = {} for a in proper_author_graph: author_product_mapping[a] = [p for p in full_graph[a] if 'starRating' in full_graph[a][p] and full_graph[a][p]['starRating'] >= 4] logging.debug('Running EM') ll, partition = HardEM.run_EM(proper_author_graph, author_product_mapping, nparts=nparts, parallel=True) print 'best loglikelihood: %s' % ll for n in partition: author_graph.node[n]['cLabel'] = int(partition[n]) nx.write_gexf(author_graph, '/home/amir/az/io/spam/spam_graph_mgraph_sage_labeled.gexf')
def test_hard_EM(N, nparts, write_labeled_graph=True, parallel=True): graph, author_prod_map = gen_test_graph(N) ll, partition = HardEM.run_EM(author_graph=graph, author_product_map=author_prod_map, nparts=nparts, parallel=parallel) print 'best loglikelihood: %s' % ll print partition.values() for n in partition: graph.node[n]['cLabel'] = int(partition[n]) if write_labeled_graph: nx.write_graphml(graph, '/home/amir/az/io/spam/synthetic_graph_sage_labeled.graphml')
def test_hard_EM(N, nparts, write_labeled_graph=True, parallel=True): graph, author_prod_map, _ = gen_synthetic_graph(N, nparts) ll, partition = HardEM.run_EM(author_graph=graph, author_product_map=author_prod_map, nparts=nparts, parallel=parallel) print 'best loglikelihood: %s' % ll print partition.values() for n in partition: graph.node[n]['cLabel'] = int(partition[n]) if write_labeled_graph: nx.write_graphml(graph, '/home/amir/amazon-spam-review/io/synthetic_graph_labeled.graphml') return graph
def test_real_graph(nparts): MIN_CC_SIZE = 10 # Nodes belonging to connected components smaller than this are discarded logging.info('Reading author collab graph') author_graph = nx.read_graphml( '/home/amir/az/io/spam/spam_mgraph_augmented.graphml') author_graph.name = 'author graph' logging.info('Reading the full author product graph') full_graph = nx.read_graphml('/home/amir/az/io/spam/spam_graph.graphml') full_graph.name = 'full graph' logging.info('Removing nodes which do not have all the features') proper_author_graph = author_graph.subgraph([ a for a in author_graph if 'revLen' in author_graph.node[a] and 'hlpful_fav_unfav' in author_graph.node[a] and 'vrf_prchs_fav_unfav' in author_graph.node[a] ]) logging.info( 'Keeping only nodes which belong to large connected components') ccs = nx.connected_components(proper_author_graph) ccs = filter(lambda cc: len(cc) >= MIN_CC_SIZE, ccs) proper_author_graph = proper_author_graph.subgraph(itertools.chain(*ccs)) # features = {'revLen': 0.0, 'hlpful_fav_unfav': False, 'vrf_prchs_fav_unfav': False} # for a in author_graph: # for feat, def_val in features.items(): # if feat not in author_graph.node[a]: # author_graph.node[a][feat] = def_val # sub sample proper_author_graph # proper_author_graph.remove_edges_from(random.sample(proper_author_graph.edges(), 2*proper_author_graph.size()/3)) # degree = proper_author_graph.degree() # proper_author_graph.remove_nodes_from([n for n in proper_author_graph if degree[n] == 0]) # author to the product reviewed by him mapping logging.debug('forming the product mapping') author_product_mapping = {} for a in proper_author_graph: author_product_mapping[a] = [ p for p in full_graph[a] if 'starRating' in full_graph[a][p] and full_graph[a][p]['starRating'] >= 4 ] logging.info('Running EM') ll, partition = HardEM.run_EM(proper_author_graph, author_product_mapping, nparts=nparts, parallel=True) print 'best loglikelihood: %s' % ll for n in partition: author_graph.node[n]['cLabel'] = 
int(partition[n]) output_filename = 'spam_graph_mgraph_labeled.gexf' logging.info( 'Writing the clusters into the graph and saving the file into %s' % output_filename) nx.write_gexf(author_graph, '/home/amir/az/io/spam/%s' % output_filename)
def test_real_graph(nparts): MIN_CC_SIZE = 10 # Nodes belonging to connected components smaller than this are discarded logging.info('Reading author collab graph') author_graph = nx.read_graphml('/home/amir/az/io/spam/spam_mgraph_augmented.graphml') author_graph.name = 'author graph' logging.info('Reading the full author product graph') full_graph = nx.read_graphml('/home/amir/az/io/spam/spam_graph.graphml') full_graph.name = 'full graph' logging.info('Removing nodes which do not have all the features') proper_author_graph = author_graph.subgraph([a for a in author_graph if 'revLen' in author_graph.node[a] and 'hlpful_fav_unfav' in author_graph.node[a] and 'vrf_prchs_fav_unfav' in author_graph.node[a]]) logging.info('Keeping only nodes which belong to large connected components') ccs = nx.connected_components(proper_author_graph) ccs = filter(lambda cc: len(cc) >= MIN_CC_SIZE, ccs) proper_author_graph = proper_author_graph.subgraph(itertools.chain(*ccs)) # features = {'revLen': 0.0, 'hlpful_fav_unfav': False, 'vrf_prchs_fav_unfav': False} # for a in author_graph: # for feat, def_val in features.items(): # if feat not in author_graph.node[a]: # author_graph.node[a][feat] = def_val # sub sample proper_author_graph # proper_author_graph.remove_edges_from(random.sample(proper_author_graph.edges(), 2*proper_author_graph.size()/3)) # degree = proper_author_graph.degree() # proper_author_graph.remove_nodes_from([n for n in proper_author_graph if degree[n] == 0]) # author to the product reviewed by him mapping logging.debug('forming the product mapping') author_product_mapping = {} for a in proper_author_graph: author_product_mapping[a] = [p for p in full_graph[a] if 'starRating' in full_graph[a][p] and full_graph[a][p]['starRating'] >= 4] logging.info('Running EM') ll, partition = HardEM.run_EM(proper_author_graph, author_product_mapping, nparts=nparts, parallel=True) print 'best loglikelihood: %s' % ll for n in partition: author_graph.node[n]['cLabel'] = 
int(partition[n]) output_filename = 'spam_graph_mgraph_labeled.gexf' logging.info('Writing the clusters into the graph and saving the file into %s'%output_filename) nx.write_gexf(author_graph, '/home/amir/az/io/spam/%s'%output_filename)
def exhaustive_ll(N, nparts, parallel=True):
    """Exhaustively score every partitioning (into at most `nparts` parts) of a
    synthetic graph with the hard-EM log-likelihood.

    Returns (list of (partition, loglikelihood, ...) tuples, cluster_sizes,
    preprocessed graph). State is passed to worker processes through the
    module-level ex_ll_* globals, which em_ll_map reads.
    """
    # Globals are the only way to hand shared state to multiprocessing.Pool
    # workers created by fork without pickling it per task.
    global ex_ll_graph, ex_ll_nparts, ex_ll_author_prod_map, ex_ll_ref_prt
    ex_ll_graph, ex_ll_author_prod_map, cluster_sizes = gen_synthetic_graph(N, nparts)
    N = sum(cluster_sizes)  # sum of cluster sizes is close to N but does not always match
    ex_ll_nparts = nparts
    ex_ll_graph, ex_ll_author_prod_map = HardEM._preprocess_graph_and_map(ex_ll_graph, ex_ll_author_prod_map)
    # reference partitioning: node i of cluster c gets label c, in generation order
    ex_ll_ref_prt = []
    for i in range(len(cluster_sizes)):
        ex_ll_ref_prt.extend([i]*cluster_sizes[i])
    ex_ll_ref_prt = tuple(ex_ll_ref_prt)
    # all possible partitioning of at most `nparts` partitions
    partitions = itertools.chain(*[gen_partition(N, nparts_i) for nparts_i in range(1, nparts + 1)])
    # Total count equals the sum of Stirling numbers of the second kind.
    logging.info('Processing %d partitions' % sum(stirling2(N, nparts_i) for nparts_i in range(1, nparts + 1)))
    if parallel:
        p = Pool()
        v = p.imap(em_ll_map, partitions)
        # close+join waits for all tasks; imap buffers results for later reading
        p.close(); p.join()
    else:
        v = itertools.imap(em_ll_map, partitions)
    v = list(v)  # since v is a generator, keeps them in a list so reading from it won't consume it
    # find the logl for the presumed correct partitioning
    ref_ll = 0
    for vv in v:
        if vv[0] == ex_ll_ref_prt:
            ref_ll = vv[1]
            break
    else:
        # for/else: runs only if the loop finished without break
        logging.error('The correct partitioning was not found')
    # keep only one from set of permutations with the same loglikelihood
    # v_dict = {ll: prt for prt, ll in v}
    # v = v_dict.items()
    # v.sort(key=lambda tup: tup[0], reverse=True)
    # for i in range(0, min(10, len(v))):
    #     print '#%d\t%s' % (i, v[i])
    # print '##\t%s' % ((ref_ll, ex_ll_ref_prt),)
    return v, cluster_sizes, ex_ll_graph
# Script: label the crawled author graph with hard-EM cluster assignments.
import logging
logging.basicConfig(level=logging.DEBUG, format='%(process)d\t%(asctime)s:%(levelname)s: %(message)s', datefmt='%H:%M:%S')
from pre_process import crawl_to_graph

DS_DIR = '/home/amir/pyproj/amazon-review-spam/io/same_cat_v2'
# Build the full author/product graph from the crawl dataset.
graph, membs, prods = crawl_to_graph(ds_dir=DS_DIR)
graph_orig = graph.copy()  # keep an untouched copy of the crawl graph

import networkx as nx
from os import path

mgraph = nx.read_gexf(path.join(DS_DIR, '%s.gexf' % 'em_unlabeled_mgraph'))
# Each author maps to all of his neighbors in the full graph
# (presumably the products he reviewed — verify against crawl_to_graph).
author_product_mapping = {}
for a in mgraph:
    author_product_mapping[a] = [p for p in graph[a]]

from hardEM_gurobi import HardEM
nparts = 4
# NOTE(review): EM is asked for nparts*5 = 20 partitions, not 4 — confirm intended.
ll, partition = HardEM.run_EM(author_graph=mgraph, author_product_map=author_product_mapping, nparts=nparts*5, parallel=True, nprocs=4)
# Write the cluster label onto each node and save the labeled graph.
for a in mgraph:
    mgraph.node[a]['cLabel'] = int(partition[a])
nx.write_gexf(mgraph, path.join(DS_DIR, '%s.gexf' % 'em_labeled_mgraph'), version='1.2draft', encoding='us-ascii')
def em_ll_map(prt):
    """Pool worker: score one candidate partition `prt` with the hard-EM
    log-likelihood. Reads the module-level ex_ll_* globals that
    exhaustive_ll sets up before forking."""
    model = HardEM(author_graph=ex_ll_graph,
                   author_product_map=ex_ll_author_prod_map,
                   nparts=ex_ll_nparts,
                   init_partition=prt)
    return prt, model.log_likelihood()
def em_ll_map(prt):
    """Pool worker: score one candidate partition `prt`.

    Returns (partition, its hard-EM log-likelihood, Rand index against the
    reference partition). Reads the module-level ex_ll_* globals that
    exhaustive_ll sets up before forking."""
    model = HardEM(author_graph=ex_ll_graph,
                   author_product_map=ex_ll_author_prod_map,
                   nparts=ex_ll_nparts,
                   init_partition=prt)
    return prt, model.log_likelihood(), rand_index(prt, ex_ll_ref_prt)