class bnoc(object):

    def __init__(self):
        """
        Initialize the bnoc app.

        For help use:
            > python bnoc.py --help
        """

        self.timing = Timing(['Snippet', 'Time [m]', 'Time [s]'])

        with self.timing.timeit_context_add('Pre-processing'):

            # Setup command-line parser options
            current_path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
            parser = args.setup_parser(current_path + '/args/bnoc.json')
            self.options = parser.parse_args()
            args.update_json(self.options)
            args.check_output(self.options)
            self.log = helper.initialize_logger(dir='log', output='log')

            if self.options.save_arff and (self.options.x is not None):
                self.log.warning('Warning: The arff format does not allow overlap in the '
                                 'first layer (parameter x). Please use --save_arff=False '
                                 'or suppress the x parameter.')
                sys.exit(1)

            self.layers = len(self.options.vertices)

            # First and last global vertex id of each layer
            self.start_end = []
            for layer in range(self.layers):
                start = sum(self.options.vertices[0:layer])
                end = sum(self.options.vertices[0:layer + 1]) - 1
                self.start_end.append([start, end])

            if self.options.p is None or self.options.balanced is True:
                self.generate_balanced_probabilities()

            for p in self.options.p:
                if round(sum(p), 1) != 1.0:
                    self.log.warning('Warning: Each list of probabilities p must sum to 1.')
                    sys.exit(1)

            if self.options.communities is None:
                self.options.communities = [1] * len(self.options.vertices)

            # Expand scalar parameters to one value per layer
            if self.options.x is not None and isinstance(self.options.x, int):
                self.options.x = [self.options.x] * self.layers
            if self.options.y is not None and isinstance(self.options.y, int):
                self.options.y = [self.options.y] * self.layers
            if self.options.z is not None and isinstance(self.options.z, int):
                self.options.z = [self.options.z] * self.layers

            # Normalize the schema to a list of [l0, l1] pairs
            if all(isinstance(item, tuple) for item in self.options.schema):
                self.options.schema = [list(elem) for elem in self.options.schema]
            if not all(isinstance(item, list) for item in self.options.schema):
                it = iter(self.options.schema)
                self.options.schema = list(zip(it, it))

            # Expand scalar parameters to one value per bipartite layer pair
            if self.options.mu is not None and isinstance(self.options.mu, (int, float)):
                self.options.mu = [self.options.mu] * len(self.options.schema)
            if self.options.dispersion is not None and isinstance(self.options.dispersion, (int, float)):
                self.options.dispersion = [self.options.dispersion] * len(self.options.schema)
            if self.options.noise is not None and isinstance(self.options.noise, (int, float)):
                self.options.noise = [self.options.noise] * len(self.options.schema)

            for layer, comm in enumerate(self.options.communities):
                if comm == 0:
                    self.log.warning('Warning: The number of communities must be greater than zero.')
                    sys.exit(1)
                if self.options.communities[layer] > self.options.vertices[layer]:
                    self.log.warning('Warning: The number of communities must be less than '
                                     'the number of vertices.')
                    sys.exit(1)
                if self.options.x is not None and self.options.z is not None:
                    if self.options.z[layer] > self.options.communities[layer]:
                        self.log.warning('Warning: The number of communities an overlapping '
                                         'vertex joins (z) must be less than the number of '
                                         'communities in every layer.')
                        sys.exit(1)

            # Default: overlapping vertices belong to two communities
            if self.options.x is not None and sum(self.options.x) > 0:
                if self.options.z is None or sum(self.options.z) == 0:
                    self.options.z = [2] * len(self.options.x)
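    # Example invocation (a sketch; the exact flag spellings are defined in
    # args/bnoc.json and may differ):
    #   python bnoc.py --vertices 100 150 --communities 4 4 \
    #       --dispersion 0.5 --mu 0.4 --noise 0.1 --save_ncol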
    def add_noise(self, matrix, noise):
        """
        Insert noise into the adjacency matrix.
        Noise (threshold) must lie in (0, 1].
        """

        # numpy.random.seed(1)

        # Remove a fraction of intra-community edges
        num_samples = numpy.count_nonzero(matrix)
        Z = [False]
        while not any(Z):  # redraw while all elements are 'False'
            Z = numpy.random.rand(num_samples) < noise
        Y = matrix[matrix > 0]
        removed_weights = Y[Z]
        Y[Z] = 0
        if self.options.hard:
            matrix[matrix > 0] = Y

        # Add a fraction of inter-community edges
        num_samples = numpy.count_nonzero(matrix == 0)
        Z = [False]
        while not any(Z):  # redraw while all elements are 'False'
            Z = numpy.random.uniform(0.0, 1.0, num_samples) < noise
        Y = matrix[matrix == 0]
        _mean = numpy.mean(removed_weights, dtype=numpy.float64)
        if not self.options.hard:
            removed_weights = [_mean] * len(removed_weights)
        # Pad with zeros so there is one candidate weight per selected cell
        removed_weights = list(removed_weights) + [0] * (numpy.count_nonzero(Z) - len(removed_weights))
        Y[Z] = numpy.random.choice(removed_weights, numpy.count_nonzero(Z))
        matrix[matrix == 0] = Y

        return matrix

    def generate_balanced_probabilities(self):
        """
        Generate a list of probabilities for each community when the
        probabilities are not given by the user or the balanced flag is on.
        """

        if self.options.p is None:
            self.options.p = [[] for _ in range(self.layers)]
        for layer in range(self.layers):
            avg = 1.0 / self.options.communities[layer]
            self.options.p[layer] = [avg] * self.options.communities[layer]
            # Force the probabilities to sum exactly to 1.0
            self.options.p[layer][-1] = 1.0 - sum(self.options.p[layer][:-1])

    def create_vertices_and_communities(self):
        """
        Create a list that gives the community of each vertex, in
        positional order, for each layer.
        """

        self.membership = [[] for _ in range(self.layers)]
        for layer in range(self.layers):
            for itr in range(max_itr):
                self.membership[layer] = numpy.random.choice(
                    self.options.communities[layer],
                    size=self.options.vertices[layer],
                    replace=True,
                    p=self.options.p[layer])
                self.membership[layer] = sorted(self.membership[layer])
                unique_row = numpy.unique(self.membership[layer])
                # Accept the draw only if every community received at least one vertex
                if len(unique_row) == self.options.communities[layer]:
                    break
            else:
                self.log.warning('Warning: Convergence failure; reduce the number of '
                                 'communities or run again.')
                sys.exit(1)
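    # Illustrative example (hypothetical numbers): with vertices=[4] and
    # communities=[2], create_vertices_and_communities may draw
    # membership[0] == [0, 0, 1, 1]; create_cover (below) inverts this into
    # cover[0] == [[0, 1], [2, 3]], with vertex ids offset by
    # start_end[layer][0] for layers other than the first.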
    def create_cover(self):
        """
        Create a list of lists that maps each community to the vertices it contains.
        """

        self.unique_comms = [0] * self.layers
        self.cover = [[] for _ in range(self.layers)]
        for layer in range(self.layers):
            self.unique_comms[layer] = list(range(self.options.communities[layer]))
            self.cover[layer] = [[] for _ in range(self.options.communities[layer])]
            for vertex, comm in enumerate(self.membership[layer]):
                self.cover[layer][comm].append(vertex + self.start_end[layer][0])

    def select_overlapping_vertices(self):
        """
        Select x vertices to be members of z communities, as expected by the model.
        """

        self.overlap = [[] for _ in range(self.layers)]
        if self.options.x is not None and sum(self.options.x) > 0:
            for layer in range(self.layers):
                self.overlap[layer] = numpy.random.choice(
                    range(self.start_end[layer][0], self.start_end[layer][1] + 1),
                    self.options.x[layer],
                    replace=False)
                for vertex in self.overlap[layer]:
                    comms = copy.copy(self.unique_comms[layer])
                    comms.remove(self.membership[layer][vertex - self.start_end[layer][0]])
                    random.shuffle(comms)
                    # Add the vertex to z - 1 communities besides its own
                    for comm in comms[:(self.options.z[layer] - 1)]:
                        self.cover[layer][comm].append(vertex)

    def create_biadj_matrix(self, l0, l1, dispersion, mu):
        """
        Create a bipartite adjacency matrix with community structure and
        edge weights drawn from a negative binomial distribution.
        """

        # Create an empty bipartite matrix
        matrix = numpy.zeros((self.options.vertices[l0], self.options.vertices[l1]), dtype=numpy.float64)

        # Pair communities across the two layers, repeating the smaller side
        # until both lists have the same length
        unique_comms = [self.unique_comms[l0], self.unique_comms[l1]]
        _max = unique_comms.index(max(unique_comms, key=len))
        _min = unique_comms.index(min(unique_comms, key=len))
        multiplier = math.ceil(len(unique_comms[_max]) / float(len(unique_comms[_min])))
        unique_comms[_min] = unique_comms[_min] * int(multiplier)
        unique_comms[_min] = unique_comms[_min][:len(unique_comms[_max])]

        # Connect all vertices within each pair of communities
        for index in range(len(unique_comms[_max])):
            for u in self.cover[l0][unique_comms[0][index]]:
                for v in self.cover[l1][unique_comms[1][index]]:
                    matrix[u - self.start_end[l0][0], v - self.start_end[l1][0]] = 1

        # Draw edge weights from a negative binomial distribution;
        # numpy's negative_binomial(n, p) counts failures before n successes,
        # so with p = 1 - mu the expected weight is dispersion * mu / (1 - mu)
        num_samples = numpy.count_nonzero(matrix)
        distribution = numpy.random.negative_binomial(dispersion, 1 - mu, num_samples)
        if self.options.normalize:
            distribution = distribution / numpy.linalg.norm(distribution)
        matrix[matrix > 0] = distribution

        return matrix

    def save_text(self, output):

        # Save type
        if self.options.save_type:
            with open(output + '.type', 'w+') as f:
                for layer in range(self.layers):
                    for i in range(self.options.vertices[layer]):
                        f.write(str(layer) + '\n')

        # Save overlap (one row per layer; open the file once so layers
        # do not overwrite each other)
        if self.options.save_overlap:
            with open(output + '.overrow', 'w+') as f:
                writer = csv.writer(f, delimiter=' ')
                for layer in range(self.layers):
                    if len(self.overlap[layer]) > 0:
                        writer.writerow(self.overlap[layer])

        # Save cover
        if self.options.save_cover:
            for layer in range(self.layers):
                with open(output + '-layer-' + str(layer) + '.cover', 'w+') as f:
                    writer = csv.writer(f, delimiter=' ')
                    for values in self.cover[layer]:
                        writer.writerow(values)

        # Save membership
        if self.options.save_membership:
            with open(output + '.membership', 'w+') as f:
                writer = csv.writer(f, delimiter=' ')
                for layer in range(self.layers):
                    clusters = self.cover[layer]
                    _clusters = [list(cluster) for cluster in clusters]
                    _n = max(max(cluster) + 1 for cluster in _clusters if cluster)
                    # Invert the cover: for each vertex, list the communities it belongs to
                    result = [[] for _ in range(_n)]
                    for idx, cluster in enumerate(clusters):
                        for item in cluster:
                            result[item].append(idx)
                    for sublist in result:
                        if sublist:
                            writer.writerow(sublist)
        # Save bipartite network
        if self.options.save_ncol or self.options.save_gml or self.options.save_arff:
            edgelist = ''
            dict_edges = dict()
            for key, matrix in enumerate(self.matrices):
                l0 = self.options.schema[key][0]
                l1 = self.options.schema[key][1]
                for i in range(matrix.shape[0]):
                    for j in range(matrix.shape[1]):
                        if matrix[i, j] != 0:
                            u = i + self.start_end[l0][0]
                            v = j + self.start_end[l1][0]
                            if self.options.unweighted is False:
                                weight = numpy.around(matrix[i, j], decimals=3)
                            else:
                                weight = 1.0
                            edgelist += '%s %s %s\n' % (u, v, weight)
                            dict_edges[(u, v)] = float(weight)

            # Save ncol
            if self.options.save_ncol:
                with open(output + '.ncol', 'w+') as f:
                    f.write(edgelist)

            # Save arff
            if self.options.save_arff:
                self.log.warning('The arff format is still under development.')
                sys.exit(1)

    def save_npy(self, output):

        # Save matrices
        if self.options.save_ncol:
            numpy.save(output + '-matrices.npy', self.matrices)

        # Save type
        if self.options.save_type:
            types = []
            for layer, vertices in enumerate(self.options.vertices):
                types.extend([layer] * vertices)
            numpy.save(output + '-type.npy', types)

        # Save overlap
        if self.options.save_overlap:
            numpy.save(output + '-overlap.npy', self.overlap)

        # Save cover
        if self.options.save_cover:
            numpy.save(output + '-cover.npy', self.cover)

        # Save membership (one file per layer, mirroring the '-layer-N'
        # naming used for covers, so layers do not overwrite each other)
        if self.options.save_membership:
            for layer in range(self.layers):
                clusters = self.cover[layer]
                _clusters = [list(cluster) for cluster in clusters]
                _n = max(max(cluster) + 1 for cluster in _clusters if cluster)
                result = [[] for _ in range(_n)]
                for idx, cluster in enumerate(clusters):
                    for item in cluster:
                        result[item].append(idx)
                numpy.save(output + '-layer-' + str(layer) + '-membership.npy', result)
""" # Graph construction with self.timing.timeit_context_add('Build BNOC'): self.create_vertices_and_communities() self.create_cover() self.select_overlapping_vertices() self.matrices = [] for index, e in enumerate(self.options.schema): matrix = self.create_biadj_matrix( e[0], e[1], self.options.dispersion[index], self.options.mu[index]) if self.options.noise[index] > 0.0: matrix = self.add_noise(matrix, self.options.noise[index]) self.matrices.append(matrix) # Save with self.timing.timeit_context_add('Save'): # Save json inf file output = self.options.output with open(output + '-inf.json', 'w+') as f: d = { 'output': self.options.output, 'directory': self.options.directory, 'extension': 'ncol', 'vertices': self.options.vertices, 'communities': self.options.communities, 'x': self.options.x, 'z': self.options.z, 'p': self.options.p, 'balanced': self.options.balanced, 'd': self.options.dispersion, 'mu': self.options.mu, 'noise': self.options.noise, 'unweighted': self.options.unweighted, 'normalize': self.options.normalize, 'conf': self.options.conf, 'show_timing': self.options.show_timing, 'save_timing_csv': self.options.save_timing_csv, 'save_timing_json': self.options.save_timing_json, 'unique_key': self.options.unique_key, 'edges': 0 } for matrix in self.matrices: d['edges'] += numpy.count_nonzero(matrix) json.dump(d, f, indent=4) if self.options.output_npy: self.save_npy(output) if self.options.output_text: self.save_text(output) if self.options.show_timing: self.timing.print_tabular() if self.options.save_timing_csv: self.timing.save_csv(output + '-timing.csv') if self.options.save_timing_json: self.timing.save_json(output + '-timing.csv')
def main():
    """
    Main entry point for the application when run from the command line.
    """

    # Timing instance
    timing = Timing(['Snippet', 'Time [m]', 'Time [s]'])

    with timing.timeit_context_add('Pre-processing'):

        # Setup command-line parser options
        current_path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        parser = args.setup_parser(current_path + '/args/mfbn.json')
        options = parser.parse_args()
        args.update_json(options)
        args.check_output(options)

        if options.input and options.vertices is None:
            print('Vertices are required when an input is given.')
            sys.exit(1)

    # Load bipartite graph
    with timing.timeit_context_add('Load graph'):
        source_graph = MGraph()
        source_graph.load(options.input, options.vertices)

    # Coarsening
    with timing.timeit_context_add('Coarsening'):
        kwargs = dict(
            reduction_factor=options.reduction_factor, max_levels=options.max_levels,
            matching=options.matching, similarity=options.similarity,
            itr=options.itr, upper_bound=options.upper_bound, gmv=options.gmv,
            tolerance=options.tolerance, reverse=options.reverse,
            seed_priority=options.seed_priority, threads=options.threads)
        coarsening = Coarsening(source_graph, **kwargs)
        coarsening.run()

    # Save
    with timing.timeit_context_add('Save'):
        output = options.output
        for index, obj in enumerate(zip(coarsening.hierarchy_levels, coarsening.hierarchy_graphs)):
            level, coarsened_graph = obj
            index += 1  # 1-based index for output file names

            if options.save_conf or options.show_conf:
                d = {
                    'source_input': options.input,
                    'source_vertices': source_graph['vertices'],
                    'source_vcount': source_graph.vcount(),
                    'source_ecount': source_graph.ecount(),
                    'coarsened_ecount': coarsened_graph.ecount(),
                    'coarsened_vcount': coarsened_graph.vcount(),
                    'coarsened_vertices': coarsened_graph['vertices'],
                    'achieved_levels': coarsened_graph['level'],
                    'reduction_factor': options.reduction_factor,
                    'max_levels': options.max_levels,
                    'similarity': options.similarity,
                    'matching': options.matching,
                    'upper_bound': options.upper_bound,
                    'gmv': options.gmv,
                    'itr': options.itr,
                    'level': level
                }
                if options.save_conf:
                    with open(output + '-' + str(index) + '-info.json', 'w+') as f:
                        json.dump(d, f, indent=4)
                if options.show_conf:
                    print(json.dumps(d, indent=4))

            if options.save_ncol:
                coarsened_graph.write(output + '-' + str(index) + '.ncol', format='ncol')

            if options.save_source:
                with open(output + '-' + str(index) + '.source', 'w+') as f:
                    for v in coarsened_graph.vs():
                        f.write(' '.join(map(str, v['source'])) + '\n')

            if options.save_membership:
                # Map each source vertex to the index of its coarse super-vertex
                membership = [0] * (source_graph['vertices'][0] + source_graph['vertices'][1])
                for v in coarsened_graph.vs():
                    for source in v['source']:
                        membership[source] = v.index
                numpy.savetxt(output + '-' + str(index) + '.membership', membership, fmt='%d')

            if options.save_predecessor:
                with open(output + '-' + str(index) + '.predecessor', 'w+') as f:
                    for v in coarsened_graph.vs():
                        f.write(' '.join(map(str, v['predecessor'])) + '\n')

            if options.save_successor:
                numpy.savetxt(output + '-' + str(index) + '.successor', coarsened_graph.vs['successor'], fmt='%d')

            if options.save_weight:
                numpy.savetxt(output + '-' + str(index) + '.weight', coarsened_graph.vs['weight'], fmt='%d')

            if options.save_gml:
                # Stringify attributes that the gml writer cannot serialize
                del coarsened_graph['adjlist']
                del coarsened_graph['similarity']
                coarsened_graph['layers'] = str(coarsened_graph['layers'])
                coarsened_graph['vertices'] = ','.join(map(str, coarsened_graph['vertices']))
                coarsened_graph['level'] = ','.join(map(str, coarsened_graph['level']))
                coarsened_graph.vs['name'] = list(map(str, range(coarsened_graph.vcount())))
                coarsened_graph.vs['type'] = list(map(str, coarsened_graph.vs['type']))
                coarsened_graph.vs['weight'] = list(map(str, coarsened_graph.vs['weight']))
                coarsened_graph.vs['successor'] = list(map(str, coarsened_graph.vs['successor']))
                for v in coarsened_graph.vs():
                    v['source'] = ','.join(map(str, v['source']))
                    v['predecessor'] = ','.join(map(str, v['predecessor']))
                coarsened_graph.write(output + '-' + str(index) + '.gml', format='gml')

            if not options.save_hierarchy:
                break

    if options.show_timing:
        timing.print_tabular()
    if options.save_timing_csv:
        timing.save_csv(output + '-timing.csv')
    if options.save_timing_json:
        timing.save_json(output + '-timing.json')
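# Entry-point guard, assuming this main() lives in its own module. With
# save_hierarchy and save_ncol enabled, each hierarchy level i is written
# as <output>-<i>.ncol (illustrative file names).
if __name__ == '__main__':
    main()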
def main():
    """
    Main entry point for the application when run from the command line.
    """

    # Timing instance
    timing = Timing(['Snippet', 'Time [m]', 'Time [s]'])

    with timing.timeit_context_add('Pre-processing'):

        # Setup command-line parser options
        current_path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        parser = args.setup_parser(current_path + '/args/mdr.json')
        options = parser.parse_args()
        args.update_json(options)
        args.check_output(options)

        # Logger instance
        log = helper.initialize_logger(dir='log', output='log')

        if options.input and options.vertices is None:
            log.warning('Vertices are required when an input is given.')
            sys.exit(1)

        # Set default values for optional parameters
        if options.reduction_factor is None:
            options.reduction_factor = 0.5
        if options.max_levels is None:
            options.max_levels = 3
        if options.matching is None:
            options.matching = 'greedy_seed_twohops'
        if options.similarity is None:
            options.similarity = 'weighted_common_neighbors'

        # Validate matching method
        valid_matching = ['gmb', 'rgmb', 'hem', 'lem', 'rm']
        if options.matching.lower() not in valid_matching:
            log.warning('Invalid matching method.')
            sys.exit(1)

        # Validate input extension
        valid_input = ['.arff', '.dat']
        if options.extension not in valid_input:
            log.warning('Invalid input extension.')
            sys.exit(1)

        # Validate similarity measure
        valid_similarity = ['common_neighbors', 'weighted_common_neighbors',
                            'salton', 'preferential_attachment', 'jaccard',
                            'adamic_adar', 'resource_allocation', 'sorensen',
                            'hub_promoted', 'hub_depressed',
                            'leicht_holme_newman', 'weighted_jaccard']
        if options.similarity.lower() not in valid_similarity:
            log.warning('Invalid similarity measure.')
            sys.exit(1)

        options.vertices = list(map(int, options.vertices))
        options.max_levels = int(options.max_levels)
        options.reduction_factor = float(options.reduction_factor)

    # Load bipartite graph
    with timing.timeit_context_add('Load'):
        if options.extension == '.arff':
            graph = helperigraph.load_csr(options.input)
        elif options.extension == '.dat':
            graph = helperigraph.load_dat(options.input, skip_last_column=options.skip_last_column, skip_rows=options.skip_rows)
        graph['level'] = 0

    # Coarsening
    with timing.timeit_context_add('Coarsening'):
        hierarchy_graphs = []
        hierarchy_levels = []
        while graph['level'] != options.max_levels:
            matching = list(range(graph.vcount()))
            levels = graph['level']
            levels += 1
            graph['similarity'] = getattr(Similarity(graph, graph['adjlist']), options.similarity)
            # Vertex range of the second layer
            start = sum(graph['vertices'][0:1])
            end = sum(graph['vertices'][0:2])
            if options.matching in ['hem', 'lem', 'rm']:
                # One-mode projection over the target layer's vertices
                one_mode_graph = graph.weighted_one_mode_projection(range(start, end))
                matching_method = getattr(one_mode_graph, options.matching)
                matching_method(matching, reduction_factor=options.reduction_factor)
            else:
                matching_method = getattr(graph, options.matching)
                matching_method(range(start, end), matching, reduction_factor=options.reduction_factor)
            coarse = graph.contract(matching)
            coarse['level'] = levels
            graph = coarse
            if options.save_hierarchy or (graph['level'] == options.max_levels):
                hierarchy_graphs.append(graph)
                hierarchy_levels.append(levels)

    # Save
    with timing.timeit_context_add('Save'):
        output = options.output
        for index, obj in enumerate(reversed(list(zip(hierarchy_levels, hierarchy_graphs)))):
            levels, graph = obj

            if options.save_conf:
                with open(output + '-' + str(index) + '.conf', 'w+') as f:
                    d = {}
                    d['source_filename'] = options.input
                    d['source_v0'] = options.vertices[0]
                    d['source_v1'] = options.vertices[1]
                    d['source_vertices'] = options.vertices[0] + options.vertices[1]
                    d['edges'] = graph.ecount()
                    d['vertices'] = graph.vcount()
                    d['reduction_factor'] = options.reduction_factor
                    d['max_levels'] = options.max_levels
                    d['similarity'] = options.similarity
                    d['matching'] = options.matching
                    d['levels'] = levels
                    for layer in range(graph['layers']):
                        vcount = str(len(graph.vs.select(type=layer)))
                        attr = 'v' + str(layer)
                        d[attr] = vcount
                    json.dump(d, f, indent=4)

            if options.save_ncol:
                graph.write(output + '-' + str(index) + '.ncol', format='ncol')

            if options.save_source:
                with open(output + '-' + str(index) + '.source', 'w+') as f:
                    for v in graph.vs():
                        f.write(' '.join(map(str, v['source'])) + '\n')

            if options.save_predecessor:
                with open(output + '-' + str(index) + '.predecessor', 'w+') as f:
                    for v in graph.vs():
                        f.write(' '.join(map(str, v['predecessor'])) + '\n')

            if options.save_successor:
                numpy.savetxt(output + '-' + str(index) + '.successor', graph.vs['successor'], fmt='%d')

            if options.save_weight:
                numpy.savetxt(output + '-' + str(index) + '.weight', graph.vs['weight'], fmt='%d')

            if options.save_adjacency:
                numpy.savetxt(output + '-' + str(index) + '.dat', helperigraph.biajcent_matrix(graph), fmt='%.2f')

            if options.save_gml:
                # Stringify attributes that the gml writer cannot serialize
                del graph['adjlist']
                del graph['similarity']
                graph['layers'] = str(graph['layers'])
                graph['vertices'] = ','.join(map(str, graph['vertices']))
                graph['level'] = str(graph['level'])
                graph.vs['name'] = list(map(str, range(graph.vcount())))
                graph.vs['type'] = list(map(str, graph.vs['type']))
                graph.vs['weight'] = list(map(str, graph.vs['weight']))
                graph.vs['successor'] = list(map(str, graph.vs['successor']))
                for v in graph.vs():
                    v['source'] = ','.join(map(str, v['source']))
                    v['predecessor'] = ','.join(map(str, v['predecessor']))
                graph.write(output + '-' + str(index) + '.gml', format='gml')

            if not options.save_hierarchy:
                break

    if options.show_timing:
        timing.print_tabular()
    if options.save_timing_csv:
        timing.save_csv(output + '-timing.csv')
    if options.save_timing_json:
        timing.save_json(output + '-timing.json')
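# Entry-point guard for this module's main(), assuming it is run as a script.
if __name__ == '__main__':
    main()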