def req():
    # Retrieve all articles from the database and crawl the revision of each
    # one via the MediaWiki API, using a bounded pool of threads.
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    articles = db_worker_view.retrieve_all_articles()
    #articles = db_worker_view.retrieve_all_articles_questionmark()
    # measure time
    start = time.clock()
    start_time_iteration = start
    iteration_number = 483  # presumably resumes counting from an earlier crawl run
    for i, article in enumerate(articles):
        # print some progress
        if i % 10000 == 0:
            # print the time for the last iteration of 10000 articles
            seconds = time.clock() - start_time_iteration
            m, s = divmod(seconds, 60)
            h, m = divmod(m, 60)
            print "Number of crawled articles: %d. Total time for last iteration of 10000 articles: %d:%02d:%02d" % (i, h, m, s)
            start_time_iteration = time.clock()
            iteration_number += 1
        # Thread pool: blocks once more threads than the set limit are running.
        pool.acquire(blocking=True)
        # Create a new thread and pass the article URL to the worker function.
        t = threading.Thread(target=worker,
                             args=(MEDIAWIKI_API_ENDPOINT + urllib.quote(article['title']) + '/' + str(article['rev_id']),
                                   article, iteration_number))
        # Start the newly created thread.
        t.start()
    seconds = time.clock() - start
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    print "Total time: %d:%02d:%02d" % (h, m, s)
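# req() relies on two names the snippet never defines: `pool`, which caps the
# number of concurrent threads, and `worker`, which fetches one URL. A minimal
# sketch of what they could look like, assuming a BoundedSemaphore with a
# hypothetical limit of 10 and a plain urllib2 fetch; the real module may
# define them differently.
import threading
import urllib2

pool = threading.BoundedSemaphore(10)

def worker(url, article, iteration_number):
    try:
        response = urllib2.urlopen(url)
        html = response.read()
        # ... process/store the crawled revision here ...
    finally:
        # Always free the slot so req() can start the next thread,
        # even if the request raised an exception.
        pool.release()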
def _evaluate_disambiguations(self):
    INPUT_FILE = self.read_path('Please enter the path of the samples file [.xml]', default='./tmp/samples.xml')
    LOGGING_PATH = self.read_path('Please enter the path of the logging file [.log]', default='./tmp/evaluation3.log', must_exist=False)
    CONTINUE = self.read_yes_no('This process might take from several minutes to several hours.\nDo you want to continue?')
    if not CONTINUE:
        print '# Aborting...'
        return

    print '# Starting evaluation...'
    # setup logging
    LOGGING_FORMAT = '%(levelname)s:\t%(asctime)-15s %(message)s'
    logging.basicConfig(filename=LOGGING_PATH, level=logging.DEBUG, format=LOGGING_FORMAT, filemode='w')

    # connect to the db
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    work_view = db.get_work_view()

    # measure time
    start = time.clock()
    evaluator = Evaluator(INPUT_FILE, work_view)
    result = evaluator.evaluate_disambiguations()
    seconds = round(time.clock() - start)
    print 'Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60)
    print 'Evaluation done! - precision: %d%%, recall: %d%%' % (round(result['precision']*100), round(result['recall']*100))
def export_data_unresolved():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_work_view = db.get_work_view()
    connection = db_work_view._db_connection

    df_clickstream = pn.read_csv('/home/ddimitrov/data/enwiki201608_unresolved_redirects/2016_08_clickstream_unresolved.tsv',
                                 sep='\t', error_bad_lines=False)

    # clickstream titles use underscores; the database uses spaces
    df_clickstream['prev'] = df_clickstream['prev'].str.replace('_', ' ')
    df_clickstream['curr'] = df_clickstream['curr'].str.replace('_', ' ')
    df_clickstream['curr_unresolved'] = df_clickstream['curr_unresolved'].str.replace('_', ' ')

    df_redirects_candidates = pn.read_sql('select * from redirects_candidates_sample', connection)

    sample_unresolved = pn.merge(df_redirects_candidates, df_clickstream, how='left',
                                 left_on=['source_article_name', 'target_article_name'],
                                 right_on=['prev', 'curr_unresolved'])
    # candidates without a clickstream match get zero transitions
    sample_unresolved['n'].fillna(0, inplace=True)

    sample_unresolved.to_csv('/home/ddimitrov/data/enwiki201608_unresolved_redirects/data_unresolved.tsv',
                             sep='\t', encoding="utf-8")
def run(self):
    self.print_title('This is the interactive runner program')
    self.create_tmp_if_not_exists()

    INPUT_FILE = self.read_path('Please enter the path of the input file [.txt]', default='./tmp/input.txt')
    OUTPUT_FILE = self.read_path('Please enter the path of the output file [.html]', default='./tmp/output.html', must_exist=False)
    LOGGING_PATH = self.read_path('Please enter the path of the logging file [.log]', default='./tmp/runner.log', must_exist=False)

    print '# Starting runner...'
    # setup logging
    LOGGING_FORMAT = '%(levelname)s:\t%(asctime)-15s %(message)s'
    logging.basicConfig(filename=LOGGING_PATH, level=logging.DEBUG, format=LOGGING_FORMAT, filemode='w')

    # measure time
    start = time.clock()

    # connect to db
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    work_view = db.get_work_view()

    # read input
    f = open(INPUT_FILE, 'r')
    text = f.read()
    # assumption: the original literal was a non-breaking space (U+00A0) that
    # got lost in formatting; as written the replace was a no-op
    text = text.replace('\xc2\xa0', ' ')
    f.close()

    # create dummy article
    article = {}
    article['type'] = 'article'
    article['id'] = None
    article['title'] = None
    article['text'] = text
    article['links'] = []

    # identify links
    link_detector = LinkDetector(work_view)
    link_detector.detect_links(article)
    # identify terms
    #term_identifier = TermIdentifier()
    #article = term_identifier.identify_terms(text)

    # find possible meanings
    meaning_finder = MeaningFinder(work_view)
    meaning_finder.find_meanings(article)

    # calculate relatedness
    relatedness_calculator = RelatednessCalculator(work_view)

    # decide for meaning
    decider = Decider(relatedness_calculator)
    decider.decide(article)

    # output results
    html_outputter = HTMLOutputter()
    html_outputter.output(article, OUTPUT_FILE)

    seconds = round(time.clock() - start)
    print 'Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60)
def pickle_aggregated_counts_distribution():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    cursor = db_worker_view._cursor
    results = {}
    try:
        # aggregated click counts per source article
        cursor.execute('select sum(counts) from clickstream_derived_internal_links group by prev_id;')
        result = cursor.fetchall()
        results['source_article'] = result
    except MySQLdb.Error, e:
        print e
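# Hypothetical completion, not in the original: the function name suggests the
# fetched distribution is dumped to disk, the way pickle_redirects_ids() below
# does. A minimal sketch, assuming `pickle` is imported and SSD_HOME comes from
# conf; the helper name and file name are assumptions.
def pickle_distribution(results, name):
    pickle.dump(results,
                open(SSD_HOME + "pickle/" + name + ".obj", "wb"),
                protocol=pickle.HIGHEST_PROTOCOL)

# e.g. pickle_distribution(results, "aggregated_counts_distribution")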
def pickle_category_counts_distribution():
    results = {}
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    cursor = db_worker_view._cursor
    for category in ['lead', 'infobox', 'body', 'left-body', 'navbox']:
        try:
            cursor.execute('select counts from link_features where counts is not null and visual_region=%s;', (category,))
            result = cursor.fetchall()
            results[category] = result
        except MySQLdb.Error, e:
            print e
def links_heatmap():
    #http://stackoverflow.com/questions/2369492/generate-a-heatmap-in-matplotlib-using-a-scatter-data-set
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    coords = db_worker_view.retrieve_all_links_coords()
    print 'coords loaded'
    x = []
    y = []
    page_lengths = db_worker_view.retrieve_all_page_lengths()
    print 'lengths loaded'
    for coord in coords:
        # normalize x by the screen width and y by the length of the source page
        x_normed = float(coord['x']) / float(1920)
        y_normed = float(coord['y']) / float(page_lengths[coord['source_article_id']])
        if x_normed <= 1.0 and y_normed <= 1.0:
            x.append(x_normed)
            y.append(y_normed)

    heatmap, xedges, yedges = np.histogram2d(x, y, bins=100)
    extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]

    fig_size = (2.4, 2)
    #fig_size = (3.5, 3)

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Log Normalized")
    # save before show: calling savefig after show writes an empty figure
    # in non-interactive runs
    plt.savefig('output/links_heatmap_lognormed_self_loop.pdf')
    plt.show()

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Normalized")
    plt.savefig('output/links_heatmap_normed_self_loop.pdf')
    plt.show()

    print "done"
def pickle_redirects_ids():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_work_view = db.get_work_view()
    redirects_list_id = []
    with open(HOME + "data/candidate_articles.tsv") as f:
        next(f)  # skip the header line
        for line in f:
            line = line.strip().split('\t')
            # look up id
            tmp = db_work_view.resolve_title(line[0].replace('_', ' '))
            #print tmp
            if tmp is not None:
                redirects_list_id.append(tmp['id'])
    pickle.dump(redirects_list_id,
                open(SSD_HOME + "pickle/redirects_ids.obj", "wb"),
                protocol=pickle.HIGHEST_PROTOCOL)
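# Usage sketch: reading the pickled ids back in a later analysis step. The
# path simply mirrors the dump above; where the list is actually consumed is
# not shown in this module.
def load_redirects_ids():
    with open(SSD_HOME + "pickle/redirects_ids.obj", "rb") as f:
        return pickle.load(f)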
def clicks_heatmap_total():
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    coords = db_worker_view.retrieve_all_links_coords_clicks()
    print 'coords loaded'
    x = []
    y = []
    values = []
    for coord in coords:
        x_normed = float(coord['x']) / float(1920)
        y_normed = float(coord['y']) / float(coord['page_length'])
        if x_normed <= 1.0 and y_normed <= 1.0:
            x.append(x_normed)
            y.append(y_normed)
            values.append(float(coord['counts']))

    # weight each link position by its click count
    heatmap, xedges, yedges = np.histogram2d(x, y, bins=100, weights=values)
    extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]

    fig_size = (2.4, 2)

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Clicks Heatmap Log Normalized")
    # save before show: calling savefig after show writes an empty figure
    # in non-interactive runs
    plt.savefig('output/clicks_heatmap_lognormed_self_loop_total.pdf')
    plt.show()

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Clicks Heatmap Normalized")
    plt.savefig('output/clicks_heatmap_normed_self_loop_total.pdf')
    plt.show()

    print "done"
from wsd.database import MySQLDatabase
from graph_tool.all import *
from conf import *

__author__ = 'dimitrovdr'

db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
db_work_view = db.get_work_view()

wikipedia = Graph()

for link in db_work_view.retrieve_all_internal_transitions():
    wikipedia.add_edge(link['from'], link['to'])
    #print 'from %s, to %s', link['from'], link['to']

#wikipedia.save("output/transitionsnetwork.xml.gz")

# filter out all nodes that have no edges
transitions_network = GraphView(wikipedia, vfilt=lambda v: v.out_degree() + v.in_degree() > 0)

print "clust"
transitions_network.vertex_properties["local_clust"] = local_clustering(transitions_network)
print "page_rank"
transitions_network.vertex_properties["page_rank"] = pagerank(transitions_network)
print "eigenvector_centr"
eigenvalue, eigenvectorcentr = eigenvector(transitions_network)
transitions_network.vertex_properties["eigenvector_centr"] = eigenvectorcentr
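# Possible final step, not in the original script: the computed vertex
# properties are never persisted as shown. Saving the annotated network the
# same way the weighted variant below does would keep them; the file name
# here is an assumption.
transitions_network.save("output/transitionsnetwork_with_measures.xml.gz")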
def links_heatmap_rel_prob():
    #http://stackoverflow.com/questions/2369492/generate-a-heatmap-in-matplotlib-using-a-scatter-data-set
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()

    # histogram of link positions
    coords = db_worker_view.retrieve_all_links_coords()
    x = []
    y = []
    page_lengths = db_worker_view.retrieve_all_page_lengths()
    for coord in coords:
        x_normed = float(coord['x']) / float(1920)
        y_normed = float(coord['y']) / float(page_lengths[coord['source_article_id']])
        if x_normed <= 1.0 and y_normed <= 1.0:
            x.append(x_normed)
            y.append(y_normed)

    links_heatmap_hist, xedges, yedges = np.histogram2d(x, y, normed=True, bins=100)
    links_extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]

    # histogram of click positions
    coords = db_worker_view.retrieve_all_links_coords_clicks()
    print 'coords loaded'
    links = {}
    x = []
    y = []
    values = []
    # count how often each link key occurs beyond its first occurrence
    for coord in coords:
        if coord['key'] in links:
            links[coord['key']] += 1
        else:
            links[coord['key']] = 0
    for coord in coords:
        x_normed = float(coord['x']) / float(1920)
        y_normed = float(coord['y']) / float(coord['page_length'])
        if x_normed <= 1.0 and y_normed <= 1.0:
            x.append(x_normed)
            y.append(y_normed)
            if links[coord['key']] == 0:
                # unique link position: attribute the full click count
                values.append(float(coord['counts']))
            else:
                # multiple occurrences: split the click count evenly
                values.append(float(coord['counts']) / float(links[coord['key']]))

    clicks_heatmap_hist, xedges, yedges = np.histogram2d(x, y, bins=100, normed=True, weights=values)
    clicks_extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]

    subtraction_hist = np.subtract(clicks_heatmap_hist, links_heatmap_hist)
    with np.errstate(divide='ignore', invalid='ignore'):
        rel_prob_hist = np.divide(clicks_heatmap_hist, links_heatmap_hist)
        rel_prob_hist[rel_prob_hist == np.inf] = 0
        rel_prob_hist = np.nan_to_num(rel_prob_hist)

    fig_size = (2.4, 2)

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(subtraction_hist, extent=clicks_extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    # save before show: calling savefig after show writes an empty figure
    # in non-interactive runs
    plt.savefig('output/clicks-links_heatmap_normed_self_loop.pdf')
    plt.show()

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(rel_prob_hist, extent=clicks_extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    plt.savefig('output/clicks_over_links_heatmap_normed_self_loop.pdf')
    plt.show()

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(subtraction_hist, extent=clicks_extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    plt.savefig('output/clicks-links_heatmap_lognormed_self_loop.pdf')
    plt.show()

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(rel_prob_hist, extent=clicks_extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    plt.savefig('output/clicks_over_links_heatmap_lognormed_self_loop.pdf')
    plt.show()

    # the same comparisons in the other direction: links relative to clicks
    subtraction_hist = np.subtract(links_heatmap_hist, clicks_heatmap_hist)
    with np.errstate(divide='ignore', invalid='ignore'):
        rel_prob_hist = np.divide(links_heatmap_hist, clicks_heatmap_hist)
        rel_prob_hist[rel_prob_hist == np.inf] = 0
        rel_prob_hist = np.nan_to_num(rel_prob_hist)

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(subtraction_hist, extent=clicks_extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    plt.savefig('output/links-clicks_heatmap_normed_self_loop.pdf')
    plt.show()

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(rel_prob_hist, extent=clicks_extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    plt.savefig('output/links_over_clicks_heatmap_normed_self_loop.pdf')
    plt.show()

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(subtraction_hist, extent=clicks_extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    plt.savefig('output/links-clicks_heatmap_lognormed_self_loop.pdf')
    plt.show()

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(rel_prob_hist, extent=clicks_extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    plt.savefig('output/links_over_clicks_heatmap_lognormed_self_loop.pdf')
    plt.show()

    print "done"
def multiple_links_heatmap():
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    coords = db_worker_view.retrieve_all_links_multpile_occ()
    print 'coords loaded'
    page_lengths = db_worker_view.retrieve_all_page_lengths()
    print 'lengths loaded'
    links = {}
    x = []
    y = []
    x_conf = []
    y_conf = []
    x_not_conf = []
    y_not_conf = []
    number_of_not_confident_clicks = 0
    number_of_confident_clicks = 0
    number_of_valid_normed_links = 0
    # count how often each link key occurs beyond its first occurrence
    for coord in coords:
        if coord['key'] in links:
            links[coord['key']] += 1
        else:
            links[coord['key']] = 0
    for coord in coords:
        x_normed = float(coord['x']) / float(1920)
        y_normed = float(coord['y']) / float(page_lengths[coord['key'][0]])
        if x_normed <= 1.0 and y_normed <= 1.0:
            x.append(x_normed)
            y.append(y_normed)
            number_of_valid_normed_links += 1
            if links[coord['key']] == 0:
                # the link occurs once on the page: its position is unambiguous
                x_conf.append(x_normed)
                y_conf.append(y_normed)
                number_of_confident_clicks += 1
            else:
                x_not_conf.append(x_normed)
                y_not_conf.append(y_normed)
                number_of_not_confident_clicks += 1

    print '###########'
    print number_of_confident_clicks
    print number_of_not_confident_clicks
    print number_of_valid_normed_links
    print len(coords)
    print '###########'

    heatmap, xedges, yedges = np.histogram2d(x_conf, y_conf, bins=100)
    extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]

    fig_size = (2.4, 2)
    #fig_size = (3.5, 3)

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Log Normalized")
    # save before show: calling savefig after show writes an empty figure
    # in non-interactive runs
    plt.savefig('output/links_heatmap_lognormed_self_loop_unique.pdf')
    plt.show()

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Normalized")
    plt.savefig('output/links_heatmap_normed_self_loop_unique.pdf')
    plt.show()

    print "unique done"

    heatmap, xedges, yedges = np.histogram2d(x_not_conf, y_not_conf, bins=100)
    extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    plt.savefig('output/links_heatmap_lognormed_self_loop_multiple.pdf')
    plt.show()

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    plt.savefig('output/links_heatmap_normed_self_loop_multiple.pdf')
    plt.show()

    print "done"
from wsd.database import MySQLDatabase
from graph_tool.all import *
from conf import *

__author__ = 'dimitrovdr'

db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
db_work_view = db.get_work_view()

wikipedia = Graph()

# add one edge per observed transition, so edge multiplicity encodes the weight
for link in db_work_view.retrieve_all_internal_transitions_counts():
    for i in range(int(link['counts'])):
        wikipedia.add_edge(link['from'], link['to'])
    #print 'from %s, to %s', link['from'], link['to']

#wikipedia.save("output/transitionsnetwork.xml.gz")

# filter out all nodes that have no edges
transitions_network = GraphView(wikipedia, vfilt=lambda v: v.out_degree() + v.in_degree() > 0)

transitions_network.save("output/transitionsnetworkweighted.xml.gz")

print "Stats for transitions network:"
print "number of nodes: %d" % transitions_network.num_vertices()
print "number of edges: %d" % transitions_network.num_edges()
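# Usage sketch: the saved network can be reloaded in a later session with
# graph_tool's standard load_graph, which reads the .xml.gz format written
# above; this reload step is not part of the original script.
from graph_tool.all import load_graph

transitions_network = load_graph("output/transitionsnetworkweighted.xml.gz")
print "number of nodes: %d" % transitions_network.num_vertices()
print "number of edges: %d" % transitions_network.num_edges()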