def save_snowball_edgelist(self, filename):
    """Dump the graph's edges to *filename* in BFS (snowball) discovery order.

    Starts from one random node and writes one '<node> <neighbor>' line per
    newly discovered neighbor. When self.debug is set, progress is printed
    every `modulo` edges via the TimeEstimator.

    filename : string
        Path of the output file (opened with mode 'w', closed on exit).
    """
    from collections import deque  # O(1) popleft instead of O(n) queue[1:] slicing

    modulo = 10000
    total = self.number_of_edges()
    estimator = TimeEstimator(total / modulo)
    count = 0
    out = open(filename, 'w')
    try:
        start = self.random_nodes()[0]
        queue = deque([start])
        # Mark nodes as visited when they are ENQUEUED, not when popped:
        # the original marked on pop, so the same node could be appended to
        # the queue several times and its edges dumped more than once.
        visited = set([start])
        while queue:
            node = queue.popleft()  # BFS: take from the front of the queue
            for neigh in self.neighbors_iter(node):
                if neigh not in visited:
                    visited.add(neigh)
                    queue.append(neigh)
                    out.write('%s %s\n' % (str(node), str(neigh)))
                    count += 1
                    if self.debug and count % modulo == 0:
                        print('INFO: %d edges dumped in save_snowball_edgelist(), total %d' % (count, total))
                        estimator.tick()
                        print(estimator.log_line())
        out.flush()
    finally:
        out.close()  # the original flushed but leaked the file handle
def save_snowball_edgelist_iter(self, filename):
    # Dump edges in snowball (discovery) order like save_snowball_edgelist(),
    # but keep the visit state in the external 'visited' parameter cache
    # instead of an in-memory queue/set, so it scales to graphs that do not
    # fit in RAM.
    #
    # Encoding of the 'visited' parameter value (floats in the cache):
    #   100000000 -> not visited yet
    #   100000001 -> fully visited (already expanded)
    #   0..count  -> discovered; the counter value fixes the expansion order
    out = open(filename, 'w')
    self.remove_parameter_cache('visited')
    self.add_parameter_cache('visited')
    self.initialize_parameter('visited', 100000000)  # 100000000 = not visited
    self.index_parameter_cache('visited')
    # TODO: do not use time, it is misleading and hard to debug; use a counter!!!
    count = 0
    modulo = 10000
    total = self.number_of_edges()
    estimator = TimeEstimator(total / modulo)
    count = 0
    # Seed: take the first node yielded by the cache iterator and mark it as
    # discovered (counter 0) so the while loop below picks it up first.
    for n, val in self.get_parameter_cache_iter('visited'):
        node, visited = n, val
        break
    visited = 0
    self.update_parameter_cache('visited', node, visited)  # values < 100000000 mean "discovered, pending expansion"
    while visited <= 100000000.0:
        # Fetch the pending node with the smallest counter (FIFO-like order).
        for n, val in self.get_parameter_cache_iter('visited', random=False, ascending=True):
            node, visited = n, val
            break
        if visited == 100000001.0 or visited == 100000000.0:
            break  # finish connected (100000001.0) or disconnected (100000000.0) graph!
        self.update_parameter_cache('visited', node, 100000001)  # 100000001 = visited
        for neigh in self.neighbors_iter(node):
            if self.get_parameter_cache('visited', neigh) <= 100000000:  # not visited
                count += 1
                self.update_parameter_cache('visited', neigh, count)
                out.write('%s %s\n' % (str(node), str(neigh)))
                if self.debug and count % modulo == 0:
                    # NOTE(review): the message says save_snowball_edgelist_big()
                    # but this function is ..._iter() — confirm intended name.
                    print 'INFO: %d edges dumped in save_snowball_edgelist_big(), total %d' % (
                        count, total)
                    estimator.tick()
                    print estimator.log_line()
    self.remove_parameter_cache('visited')
def save_edgelist(self, path, comments='#', delimiter=' ', data=False):
    '''Save the graph as a set of directed links, one per line:

    <nodeA> <nodeB>

    path : file or string
        File object or filename to write to. A file handle opened here is
        closed on exit; a handle passed by the caller is left open.
    comments : string, optional
        Character used to indicate the start of a comment. Kept for API
        compatibility; not currently used when writing.
    delimiter : string, optional
        String used to separate values. Kept for API compatibility; a
        single space is currently hard-coded in the output format.
    data : bool, optional
        If True write a string representation of the edge data. Kept for
        API compatibility; currently ignored.
    '''
    # The original did `try: open(path) except: out = path` with a bare
    # except, which silently swallowed unrelated errors (e.g. permission
    # denied). Detect file-like objects explicitly instead.
    opened_here = False
    if hasattr(path, 'write'):
        out = path
    else:
        out = open(path, 'w')
        opened_here = True
    modulo = 100000
    total = self.number_of_edges()
    estimator = TimeEstimator(total / modulo)
    count = 1
    try:
        for src, dst in self.edges_iter():
            out.write('%s %s\n' % (str(src), str(dst)))
            if self.debug and count % modulo == 0:
                print('INFO: %d edges dumped in save_edgelist(), total %d' % (count, total))
                estimator.tick()
                print(estimator.log_line())
            count += 1
    finally:
        if opened_here:
            out.close()  # only close handles we opened ourselves
def load_edgelist(self, fileobj, num=False, use_big_alphabet=False):
    """Read '<src> <dst>' pairs from *fileobj* and add each as an edge.

    Blank lines and lines starting with '#' are skipped. Loading stops
    after self.max_links_input edges. With num=True tokens are decoded to
    integers, either via Base.base2num (use_big_alphabet=True) or int().
    Debug progress goes to stdout every self.input_debug_links links.
    """
    step = self.input_debug_links
    expected = self.max_links_input
    estimator = TimeEstimator(expected / step)
    if use_big_alphabet:
        decoder = Base()
    loaded = 0
    for raw_line in fileobj:
        stripped = raw_line.strip()
        if not stripped or stripped[0] == '#':
            continue  # skip blanks and comment lines
        fields = raw_line.split()
        if num:
            if use_big_alphabet:
                src = decoder.base2num(fields[0])
                dst = decoder.base2num(fields[1].strip())
            else:
                src = int(fields[0])
                dst = int(fields[1].strip())
        else:
            src = fields[0]
            dst = fields[1].strip()
        self.add_edge(src, dst)
        loaded += 1
        if self.debug and loaded % self.input_debug_links == 0:
            sys.stdout.write(
                'INFO: INPUT load_edgelist(), link count = %d %s\n' % (loaded, time.ctime()))
        if self.debug and loaded % step == 0:
            print('INFO: %d edges loaded in load_edgelist(), estimated total %d' % (loaded, expected))
            estimator.tick()
            print(estimator.log_line())
        if loaded >= self.max_links_input:
            break
    if self.debug:
        sys.stdout.write('INFO: FINISH INPUT load_edgelist(), link count = %d\n' % loaded)
def run(out, strategy, coverages, max_effort):
    # Evaluate one crawl strategy over the coverage targets and append one
    # result line per coverage level to the writable *out*.
    #
    # NOTE(review): the *strategy* parameter is never used; the body reads
    # the out-of-view names `strat`, `strat_name`, `l` and `graph` —
    # presumably module globals. Confirm whether `strat(...)` was meant to
    # be `strategy(...)`.
    estimator = TimeEstimator(len(coverages))
    for coverage, cost in izip(iter(coverages), strat(coverages, max_effort)):
        #print '%d %.2f %s %d' % (l, coverage, strat_name.replace('crawler','crawlr'), cost)
        estimator.tick()
        log_line = estimator.log_line()
        # Output columns: label, coverage, strategy name, cost, cost per node, timing info.
        out.write('%d %.2f %s %d %f %s\n' % (l, coverage, strat_name.replace('crawler', 'crawlr'),
                                             cost, float(cost) / graph.number_of_nodes(), log_line))
def index_parameter_generic(self, param_name, param_iter_func):
    """Fill the parameter cache *param_name* with one value per node and index it.

    Pairs self.nodes_iter() with the values produced by param_iter_func()
    in lockstep; when self.debug is set, prints progress every 1000 nodes.
    """
    self.add_parameter_cache(param_name)
    step = 1000
    estimator = TimeEstimator(self.number_of_nodes() / step)
    done = 0
    for graph_node, param_value in izip(self.nodes_iter(), param_iter_func()):
        done += 1
        if self.debug and done % step == 0:
            print('INFO: %d nodes processed in index_parameter_generic, param_name %s' % (done, param_name))
            estimator.tick()
            print(estimator.log_line())
        self.insert_parameter_cache(param_name, graph_node, param_value)
    self.index_parameter_cache(param_name)
def testTimeEstimator(self):
    # Drive TimeEstimator through five one-second iterations, then check
    # its timing accessors and the exact formatted log line.
    n_iters = 5
    estimator = TimeEstimator(n_iters)
    for _ in range(n_iters):
        time.sleep(1.0)
        estimator.tick()
    self.assertAlmostEqual(estimator.time_elapsed(), 5.0, places=1)
    self.assertAlmostEqual(estimator.time_per_iteration(), 1.0, places=1)
    self.assertAlmostEqual(estimator.time_left(), 0.0, places=1)
    self.assertEqual(
        estimator.log_line(),
        'INFO: 5 iterations | 5 total , 5.0 secs (0.1 mins) elapsed | 1.0 secs (0.0 mins) per it. | 0.0 secs (0.0 mins) left')
def run(q, strategy, coverages, coverage_funcs, max_effort):
    # Multiprocessing variant of run(): one result line per (coverage,
    # coverage function) combination is put on queue *q*; 'FINISHED' is put
    # when the strategy reports a negative cost.
    #
    # NOTE(review): *strategy* is never used; the body reads the out-of-view
    # names `strat`, `strat_name`, `l`, `graph` and `coverage_map` —
    # presumably module globals. Confirm whether `strat(...)` was meant to
    # be `strategy(...)`.
    estimator = TimeEstimator(len(coverages) * len(coverage_funcs))
    # NOTE(review): every key maps to the SAME `coverages` sequence here; if
    # each coverage function was meant to yield its own series, this is
    # probably a bug — verify against strat()'s expectations.
    coverages = dict([(coverage_map(cov_func), coverages)
                      for cov_func in coverage_funcs])
    for cost, coverage, coverage_type in strat(coverages, max_effort):
        #print '%d %.2f %s %d' % (l, coverage, strat_name.replace('crawler','crawlr'), cost)
        if cost >= 0:  # not finished yet...
            estimator.tick()
            log_line = estimator.log_line()
            # Output columns: label, coverage, coverage type, strategy name,
            # cost, cost per node, timing info.
            q.put('%d %.7f %s %s %d %f %s\n' % (l, coverage, coverage_type,
                                                strat_name.replace('crawler', 'crawlr'),
                                                cost, float(cost) / graph.number_of_nodes(), log_line))
        else:
            q.put('FINISHED')