def _evaluate_disambiguations(self):
    INPUT_FILE = self.read_path('Please enter the path of the samples file [.xml]', default='./tmp/samples.xml')
    LOGGING_PATH = self.read_path('Please enter the path of the logging file [.log]', default='./tmp/evaluation3.log', must_exist=False)
    CONTINUE = self.read_yes_no('This process might take from several minutes to several hours.\nDo you want to continue?')
    if not CONTINUE:
        print '# Aborting...'
        return

    print '# Starting evaluation...'

    # setup logging
    LOGGING_FORMAT = '%(levelname)s:\t%(asctime)-15s %(message)s'
    logging.basicConfig(filename=LOGGING_PATH, level=logging.DEBUG, format=LOGGING_FORMAT, filemode='w')

    # connecting to db
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    work_view = db.get_work_view()

    # measure time
    start = time.clock()

    evaluator = Evaluator(INPUT_FILE, work_view)
    result = evaluator.evaluate_disambiguations()

    seconds = round(time.clock() - start)
    print 'Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60)
    print 'Evaluation done! - precision: %d%%, recall: %d%%' % (round(result['precision'] * 100), round(result['recall'] * 100))
def export_data_unresolved():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_work_view = db.get_work_view()
    connection = db_work_view._db_connection

    df_clickstream = pn.read_csv('/home/ddimitrov/data/enwiki201608_unresolved_redirects/2016_08_clickstream_unresolved.tsv',
                                 sep='\t', error_bad_lines=False)

    df_clickstream['prev'] = df_clickstream['prev'].str.replace('_', ' ')
    df_clickstream['curr'] = df_clickstream['curr'].str.replace('_', ' ')
    df_clickstream['curr_unresolved'] = df_clickstream['curr_unresolved'].str.replace('_', ' ')

    df_redirects_candidates = pn.read_sql('select * from redirects_candidates_sample', connection)

    sample_unresolved = pn.merge(df_redirects_candidates, df_clickstream, how='left',
                                 left_on=['source_article_name', 'target_article_name'],
                                 right_on=['prev', 'curr_unresolved'])

    sample_unresolved['n'].fillna(0, inplace=True)

    sample_unresolved.to_csv('/home/ddimitrov/data/enwiki201608_unresolved_redirects/data_unresolved.tsv',
                             sep='\t', encoding="utf-8")
def pickle_vis_data_pandas():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    df = pd.read_sql('select source_article_id, target_article_id, target_y_coord_1920_1080, target_x_coord_1920_1080, visual_region from link_features', conn)
    print len(df)

    no_dup = df.sort(['source_article_id', 'target_y_coord_1920_1080', 'target_x_coord_1920_1080']).groupby(["source_article_id", "target_article_id"]).first()
    print len(no_dup)

    feature = no_dup.loc[no_dup['visual_region'] == 'lead']
    print len(feature)
    feature.reset_index(inplace=True)

    feature = no_dup.loc[no_dup['visual_region'] == 'infobox']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id', 'target_article_id']].to_csv('/home/ddimitrov/tmp/infobox.tsv', sep='\t', index=False)

    feature = no_dup.loc[no_dup['visual_region'] == 'navbox']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id', 'target_article_id']].to_csv('/home/ddimitrov/tmp/navbox.tsv', sep='\t', index=False)

    feature = no_dup.loc[no_dup['visual_region'] == 'left-body']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id', 'target_article_id']].to_csv('/home/ddimitrov/tmp/left-body.tsv', sep='\t', index=False)

    feature = no_dup.loc[no_dup['visual_region'] == 'body']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id', 'target_article_id']].to_csv('/home/ddimitrov/tmp/body.tsv', sep='\t', index=False)
def table_parser(self, file_name, root):
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_build_view = db.get_build_view()
    cursor = db_build_view._cursor

    # setup logging
    LOGGING_FORMAT = '%(levelname)s:\t%(asctime)-15s %(message)s'
    LOGGING_PATH = 'tmp/tableclasses-dbinsert.log'
    logging.basicConfig(filename=LOGGING_PATH, level=logging.DEBUG, format=LOGGING_FORMAT, filemode='w')

    html_parser = WikipediaHTMLTableParser()
    zip_file_path = os.path.join(root, file_name)
    html = self.zip2html(zip_file_path)
    html_parser.feed(html.decode('utf-8'))
    source_article_id = file_name.split('_')[1]
    try:
        fed_parser = WikipediaFedTextParser(html_parser.get_data())
        table_classes = fed_parser.table_classes(None)
        table_classes = list(set(table_classes))
        for table_class in table_classes:
            self.insert_table_class(source_article_id, table_class, cursor)
    except KeyError:
        db_build_view._db_connection.rollback()
        logging.error('KeyError FedTextParser source article id: %s ' % source_article_id)
    db_build_view.commit()
    db_build_view.reset_cache()
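# The zip2html helper used by table_parser() is not shown in this excerpt. A minimal sketch of
# what it is assumed to do (hypothetical implementation, would live on the same class): open the
# zipped crawl result and return the raw HTML bytes of its single member.
def zip2html(self, zip_file_path):
    import zipfile
    with zipfile.ZipFile(zip_file_path) as archive:
        # read the first (and assumed only) member as raw bytes
        return archive.read(archive.namelist()[0])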
def build_links_position_table():
    """creates the basic database structure"""
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()
    cursor.execute(
        'CREATE TABLE `links` ('
        '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,'
        '`source_article_id` BIGINT UNSIGNED NOT NULL,'
        '`target_article_id` BIGINT UNSIGNED NOT NULL,'
        ' target_position_in_text INT UNSIGNED NOT NULL,'
        ' target_position_in_text_only INT UNSIGNED,'
        ' target_position_in_section INT UNSIGNED,'
        ' target_position_in_section_in_text_only INT UNSIGNED,'
        ' section_name VARCHAR(1000) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
        ' section_number INT UNSIGNED,'
        ' target_position_in_table INT UNSIGNED,'
        ' table_number INT UNSIGNED,'
        ' table_css_class VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
        ' table_css_style VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
        ' target_x_coord_1920_1080 INT UNSIGNED DEFAULT NULL,'
        ' target_y_coord_1920_1080 INT UNSIGNED DEFAULT NULL,'
        'INDEX(`target_article_id`),'
        'INDEX(`source_article_id`)'
        ') ENGINE=InnoDB;')
    connection.close()
def build_links_position_table():
    """creates the basic database structure"""
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()
    cursor.execute(
        'CREATE TABLE `redirects_candidates` ('
        '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,'
        '`source_article_id` BIGINT UNSIGNED NOT NULL,'
        '`target_article_id` BIGINT UNSIGNED NULL,'
        '`target_article_name` VARCHAR(1000) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
        ' target_position_in_text INT UNSIGNED NOT NULL,'
        ' target_position_in_text_only INT UNSIGNED,'
        ' target_position_in_section INT UNSIGNED,'
        ' target_position_in_section_in_text_only INT UNSIGNED,'
        ' section_name VARCHAR(1000) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
        ' section_number INT UNSIGNED,'
        ' target_position_in_table INT UNSIGNED,'
        ' table_number INT UNSIGNED,'
        ' table_css_class VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
        ' table_css_style VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
        ' target_x_coord_1920_1080 INT UNSIGNED DEFAULT NULL,'
        ' target_y_coord_1920_1080 INT UNSIGNED DEFAULT NULL,'
        'INDEX(`target_article_id`),'
        'INDEX(`source_article_id`)'
        ') ENGINE=InnoDB;')
    connection.close()
def req():
    # Get URLs from a text file, remove white space.
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    articles = db_worker_view.retrieve_all_articles()
    #articles = db_worker_view.retrieve_all_articles_questionmark()

    # measure time
    start = time.clock()
    start_time_iteration = start
    iteration_number = 483
    for i, article in enumerate(articles):
        # print some progress
        if i % 10000 == 0:
            # print time for the iteration
            seconds = time.clock() - start_time_iteration
            m, s = divmod(seconds, 60)
            h, m = divmod(m, 60)
            print "Number of crawled articles: %d. Total time for last iteration of 10000 articles: %d:%02d:%02d" % (i, h, m, s)
            start_time_iteration = time.clock()
            iteration_number += 1

        # Thread pool.
        # Blocks other threads (more than the set limit).
        pool.acquire(blocking=True)

        # Create a new thread.
        # Pass each URL (i.e. u parameter) to the worker function.
        t = threading.Thread(target=worker,
                             args=(MEDIAWIKI_API_ENDPOINT + urllib.quote(article['title']) + '/' + str(article['rev_id']),
                                   article, iteration_number))

        # Start the newly created thread.
        t.start()

    seconds = time.clock() - start
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    print "Total time: %d:%02d:%02d" % (h, m, s)
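# req() relies on a module-level semaphore `pool` and a `worker` function that are not part of
# this excerpt. A minimal sketch of what they might look like (assumed, not the original
# implementation): the semaphore caps the number of concurrent crawler threads, and the worker
# fetches the URL and releases its slot when done.
import threading
import urllib2  # assumed to be available alongside urllib in this Python 2 code base

pool = threading.BoundedSemaphore(value=20)  # assumed concurrency limit

def worker(url, article, iteration_number):
    try:
        response = urllib2.urlopen(url)
        content = response.read()
        # ... store `content` for `article`, e.g. write it to disk or the database ...
    finally:
        pool.release()  # free the slot for the next thread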
def run(self):
    self.print_title('This is the interactive runner program')
    self.create_tmp_if_not_exists()

    INPUT_FILE = self.read_path('Please enter the path of the input file [.txt]', default='./tmp/input.txt')
    OUTPUT_FILE = self.read_path('Please enter the path of the output file [.html]', default='./tmp/output.html', must_exist=False)
    LOGGING_PATH = self.read_path('Please enter the path of the logging file [.log]', default='./tmp/runner.log', must_exist=False)

    print '# Starting runner...'

    # setup logging
    LOGGING_FORMAT = '%(levelname)s:\t%(asctime)-15s %(message)s'
    logging.basicConfig(filename=LOGGING_PATH, level=logging.DEBUG, format=LOGGING_FORMAT, filemode='w')

    # measure time
    start = time.clock()

    # connect to db
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    work_view = db.get_work_view()

    # read input
    f = open(INPUT_FILE, 'r')
    text = f.read()
    text = text.replace(' ', ' ')
    f.close()

    # create dummy article
    article = {}
    article['type'] = 'article'
    article['id'] = None
    article['title'] = None
    article['text'] = text
    article['links'] = []

    # identify links
    link_detector = LinkDetector(work_view)
    link_detector.detect_links(article)

    # identify terms
    #term_identifier = TermIdentifier()
    #article = term_identifier.identify_terms(text)

    # find possible meanings
    meaning_finder = MeaningFinder(work_view)
    meaning_finder.find_meanings(article)

    # calculate relatedness
    relatedness_calculator = RelatednessCalculator(work_view)

    # decide for meaning
    decider = Decider(relatedness_calculator)
    decider.decide(article)

    # output results
    html_outputter = HTMLOutputter()
    html_outputter.output(article, OUTPUT_FILE)

    seconds = round(time.clock() - start)
    print 'Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60)
def pickle_correlations_zeros():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id, IFNULL(counts, 0) as counts from link_features group by source_article_id, target_article_id', conn)
    print 'group'
    article_counts = df.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    print 'write to file'
    article_counts[["target_article_id", "counts"]].to_csv(TMP + 'article_counts.tsv', sep='\t', index=False)
def __init__(self, path):
    #os.environ["DISPLAY"]=":1"
    print path
    os.environ["DISPLAY"] = ":1"
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    self.db_build_view = db.get_build_view()
    self.cursor = self.db_build_view._cursor
    self.app = QApplication(sys.argv)
    self.path = path
def pickle_category_counts_distribution():
    results = {}
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    cursor = db_worker_view._cursor
    for category in ['lead', 'infobox', 'body', 'left-body', 'navbox']:
        try:
            cursor.execute('select counts from link_features where counts is not null and visual_region=%s;', (category,))
            result = cursor.fetchall()
            results[category] = result
        except MySQLdb.Error, e:
            print e
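# pickle_category_counts_distribution() collects the per-category counts, but the code that
# writes them out is not part of this excerpt. Presumably the distribution is pickled along
# these lines (assumed file name and location):
#
#     pickle.dump(results,
#                 open(SSD_HOME + "pickle/category_counts_distribution.obj", "wb"),
#                 protocol=pickle.HIGHEST_PROTOCOL)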
def build_page_length_table():
    """creates the basic database structure"""
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE `page_length` ('
                   '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY,'
                   ' page_length_1920_1080 INT UNSIGNED DEFAULT NULL'
                   ') ENGINE=InnoDB;')
    connection.close()
def build_page_length_table():
    """creates the basic database structure"""
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE `redirects_candidates_page_length` ('
                   '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY,'
                   ' page_length_1920_1080 INT UNSIGNED DEFAULT NULL'
                   ') ENGINE=InnoDB;')
    connection.close()
def _create_structure(self):
    # measure time
    start = time.clock()

    # creating structure
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db.build()

    seconds = round(time.clock() - start)
    logging.info('Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60))
    print 'Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60)
def pickle_aggregated_counts_distribution():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    cursor = db_worker_view._cursor
    results = {}
    try:
        cursor.execute('select sum(counts) from clickstream_derived_internal_links group by prev_id;')
        result = cursor.fetchall()
        results['source_article'] = result
    except MySQLdb.Error, e:
        print e
def plot_degree_filtered_sql():
    print 'before select'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()

    cursor.execute('SELECT source_article_id, target_article_id FROM link_occurences where source_article_id in '
                   ' (select distinct prev_id from clickstream_derived_internal_links);')
    result = cursor.fetchall()
    network = Graph()
    print 'after select'
    print 'result len'
    print len(result)
    for i, link in enumerate(result):
        if i % 1000000 == 0:
            print i, len(result)
        network.add_edge(link[0], link[1])

    # filter all nodes that have no edges
    print 'filter nodes with degree zero graph tool specific code'
    network = GraphView(network, vfilt=lambda v: v.out_degree() + v.in_degree() > 0)
    print 'before save'
    network.save("output/wikipedianetworkfilteredwithtransitions_prev_id.xml.gz")
    print 'done'

    cursor.execute('SELECT source_article_id, target_article_id FROM link_occurences where target_article_id in '
                   ' (select distinct curr_id from clickstream_derived_internal_links);')
    result = cursor.fetchall()
    network = Graph()
    print 'after select'
    print 'result len'
    print len(result)
    for i, link in enumerate(result):
        if i % 1000000 == 0:
            print i, len(result)
        network.add_edge(link[0], link[1])

    # filter all nodes that have no edges
    print 'filter nodes with degree zero graph tool specific code'
    network = GraphView(network, vfilt=lambda v: v.out_degree() + v.in_degree() > 0)
    print 'before save'
    network.save("output/wikipedianetworkfilteredwithtransitions_curr_id.xml.gz")
    print 'done'
def links_heatmap():
    #http://stackoverflow.com/questions/2369492/generate-a-heatmap-in-matplotlib-using-a-scatter-data-set
    # Get URLs from a text file, remove white space.
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    coords = db_worker_view.retrieve_all_links_coords()
    print 'coord loaded'
    x = []
    y = []

    page_lengths = db_worker_view.retrieve_all_page_lengths()
    print 'lengths loaded'
    for coord in coords:
        x_normed = float(coord['x']) / float(1920)
        y_normed = float(coord['y']) / float(page_lengths[coord['source_article_id']])
        if x_normed <= 1.0 and y_normed <= 1.0:
            x.append(x_normed)
            y.append(y_normed)

    heatmap, xedges, yedges = np.histogram2d(x, y, bins=100)
    extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]

    fig_size = (2.4, 2)
    #fig_size = (3.5, 3)

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Log Normalized")
    plt.show()
    plt.savefig('output/links_heatmap_lognormed_self_loop.pdf')

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Normalized")
    plt.show()
    plt.savefig('output/links_heatmap_normed_self_loop.pdf')

    print "done"
def correlations(network_name):
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    # wikipedia graph structural statistics
    results = None
    try:
        results = cursor.execute('select c.curr_id, sum(c.counts) as counts from clickstream_derived c where c.link_type_derived= %s group by c.curr_id;', ("internal-link",))
        results = cursor.fetchall()
    except MySQLdb.Error, e:
        print 'error retrieving xy coord for all links %s (%d)' % (e.args[1], e.args[0])
def build_table():
    """creates the basic database structure"""
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE `table_css_class` ('
                   '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,'
                   '`source_article_id` BIGINT UNSIGNED NOT NULL,'
                   ' css_class VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
                   'INDEX(`source_article_id`)'
                   ') ENGINE=InnoDB;')
    connection.close()
def pickle_redirects_ids():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_work_view = db.get_work_view()
    redirects_list_id = []
    with open(HOME + "data/candidate_articles.tsv") as f:
        next(f)
        for line in f:
            line = line.strip().split('\t')
            #look up id
            tmp = db_work_view.resolve_title(line[0].replace('_', ' '))
            #print tmp
            if tmp is not None:
                redirects_list_id.append(tmp['id'])

    pickle.dump(redirects_list_id, open(SSD_HOME + "pickle/redirects_ids.obj", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
def pickle_correlations_zeros_january():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id from link_features', conn)
    print 'loaded links'
    df2 = pd.read_sql('select prev_id, curr_id, counts from clickstream_derived_en_201501 where link_type_derived= "internal-link";', conn)
    print 'loaded counts'
    result = pd.merge(df, df2, how='left', left_on=['source_article_id', 'target_article_id'], right_on=['prev_id', 'curr_id'])
    print 'merged counts'
    print result
    article_counts = result.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    article_counts['counts'].fillna(0.0, inplace=True)
    print article_counts
    print 'write to file'
    article_counts[["target_article_id", "counts"]].to_csv(TMP + 'january_article_counts.tsv', sep='\t', index=False)
def clicks_heatmap_total():
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    coords = db_worker_view.retrieve_all_links_coords_clicks()
    print 'coord loaded'
    links = {}
    x = []
    y = []
    values = []
    for coord in coords:
        x_normed = float(coord['x']) / float(1920)
        y_normed = float(coord['y']) / float(coord['page_length'])
        if x_normed <= 1.0 and y_normed <= 1.0:
            x.append(x_normed)
            y.append(y_normed)
            values.append(float(coord['counts']))

    heatmap, xedges, yedges = np.histogram2d(x, y, bins=100, weights=values)
    extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]

    fig_size = (2.4, 2)

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Clicks Heatmap Log Normalized")
    plt.show()
    plt.savefig('output/clicks_heatmap_lognormed_self_loop_total.pdf')

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Clicks Heatmap Normalized")
    plt.show()
    plt.savefig('output/clicks_heatmap_normed_self_loop_total.pdf')

    print "done"
def rbo():
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()

    sm = []
    try:
        cursor.execute('select curr_id, sum(counts) as counts_sum, curr_title from clickstream_derived where link_type_derived=%s group by curr_id order by counts_sum desc limit 10000;', ("entry-sm",))
        result = cursor.fetchall()
        for row in result:
            record = {}
            record['curr_id'] = row[0]
            record['counts_sum'] = row[1]
            record['curr_title'] = row[2]
            sm.append(row[0])
    except MySQLdb.Error, e:
        print e
def _extract_articles(self):
    INPUT_FILE = WIKI_DUMP_XML_FILE  #self.read_path('Please enter the path of the wiki dump file [.xml]')
    #INPUT_FILE = "/home/ddimitrov/wikiwsd/data/training.xml"
    MAX_ARTICLES_IN_QUEUE = 200  #self.read_number('How many articles should be kept in the memory at any time at most?', 200, 20, 1000)
    NUM_THREADS = 1  #self.read_number('How many threads shall be used to write to the database?', 20, 1, 50)
    CONTINUE = True  #self.read_yes_no('This process might take several days to finish.\nDo you want to continue?')
    if CONTINUE:
        # measure time
        start = time.clock()

        # connect to database and create article queue
        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
        queue = Queue.Queue(maxsize=MAX_ARTICLES_IN_QUEUE)

        # create reader and threads
        reader = WikipediaReader(INPUT_FILE, queue, extract_text=False)
        threads = []
        for i in range(0, NUM_THREADS):
            inserter = ArticleInserter(queue, db.get_build_view())
            threads.append(inserter)

        # start reader
        reader.start()

        # start insert threads
        for thread in threads:
            thread.start()

        # wait for reading thread, queue and inserters to be done
        reader.join()
        queue.join()
        for thread in threads:
            thread.end()
        for thread in threads:
            thread.join()

        seconds = round(time.clock() - start)
        print 'Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60)
    else:
        print 'Aborting...'
def correlations_zeros(labels, consider_zeros=True, clickstream_data='', struct=False):
    #load network
    print struct
    name = '_'.join(labels)
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering_" + name + ".xml.gz")

    #read counts with zeros
    if consider_zeros:
        article_counts = pd.read_csv(TMP + clickstream_data + 'article_counts.tsv', sep='\t')
        print TMP + clickstream_data + 'article_counts.tsv'
        correlations_weighted_pagerank = {}
        for label in labels:
            if struct:
                label = label[7:]
            for damping in [0.8, 0.85, 0.9]:
                key = label + "_page_rank_weighted_" + str(damping)
                pagerank = wikipedia.vertex_properties[key]
                page_rank_values = list()
                counts = list()
                correlations_values = {}
                for index, row in article_counts.iterrows():
                    counts.append(float(row['counts']))
                    page_rank_values.append(pagerank[wikipedia.vertex(int(row['target_article_id']))])
                print 'pearson'
                p = pearsonr(page_rank_values, counts)
                print p
                correlations_values['pearson'] = p
                print 'spearmanr'
                s = spearmanr(page_rank_values, counts)
                print s
                correlations_values['spearmanr'] = s
                print 'kendalltau'
                k = kendalltau(page_rank_values, counts)
                print k
                correlations_values['kendalltau'] = k
                correlations_weighted_pagerank[key] = correlations_values

        write_pickle(HOME + 'output/correlations/' + clickstream_data + 'correlations_pagerank_' + name + '.obj',
                     correlations_weighted_pagerank)
    else:
        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
        conn = db._create_connection()
        cursor = conn.cursor()
        # wikipedia graph structural statistics
        results = None
        try:
            if clickstream_data != '':
                results = cursor.execute('select c.curr_id, sum(c.counts) as counts from clickstream_derived c where c.link_type_derived= %s group by c.curr_id;', ("internal-link",))
                results = cursor.fetchall()
            else:
                results = cursor.execute('select c.curr_id, sum(c.counts) as counts from clickstream_derived_en_201501 c where c.link_type_derived= %s group by c.curr_id;', ("internal-link",))
                results = cursor.fetchall()
        except MySQLdb.Error, e:
            print 'error retrieving xy coord for all links %s (%d)' % (e.args[1], e.args[0])
        print 'after sql load'

        correlations_weighted_pagerank = {}
        for label in labels:
            if struct:
                label = label[7:]
            for damping in [0.8, 0.85, 0.9]:
                key = label + "_page_rank_weighted_" + str(damping)
                pagerank = wikipedia.vertex_properties[key]
                correlations = {}
                counts = []
                page_rank_values = []
                for row in results:
                    counts.append(float(row[1]))
                    page_rank_values.append(pagerank[wikipedia.vertex(int(row[0]))])
                print 'pearson'
                p = pearsonr(page_rank_values, counts)
                print p
                correlations['pearson'] = p
                print 'spearmanr'
                s = spearmanr(page_rank_values, counts)
                print s
                correlations['spearmanr'] = s
                print 'kendalltau'
                k = kendalltau(page_rank_values, counts)
                print k
                correlations['kendalltau'] = k
                correlations_weighted_pagerank[key] = correlations

        write_pickle(HOME + 'output/correlations/' + clickstream_data + 'correlations_pagerank_without_zeros' + name + '.obj',
                     correlations_weighted_pagerank)
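# correlations_zeros() and the weighted-pagerank code further below use small
# read_pickle/write_pickle helpers that are not part of this excerpt. Minimal sketches of what
# they are assumed to do (hypothetical implementations):
def write_pickle(path, obj):
    # serialize obj to path with the highest pickle protocol
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def read_pickle(path):
    # load a previously pickled object from path
    with open(path, 'rb') as f:
        return pickle.load(f)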
def print_table():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    df = pd.read_sql('select source_article_id, target_article_id, rel_degree, rel_in_degree, rel_out_degree, '
                     'rel_page_rank, rel_kcore, target_x_coord_1920_1080, target_y_coord_1920_1080, visual_region, '
                     'IFNULL(counts, 0) as counts from link_features order by source_article_id, target_y_coord_1920_1080, target_x_coord_1920_1080', conn)
    print "dup"
    #no_dup = df.sort(['source_article_id','target_y_coord_1920_1080','target_x_coord_1920_1080']).groupby(["source_article_id", "target_article_id"]).first()
    no_dup = df.groupby(["source_article_id", "target_article_id"]).first()
    no_dup = no_dup.reset_index()
    print "no dup"
    del df
    #print no_dup

    df_top = pd.read_sql("select source_article_id, target_article_id, sim as topic_similarity from topic_similarity", conn)
    print "no up"
    topDF = df_top.groupby("source_article_id", as_index=False)["topic_similarity"].median()
    #print topDF
    print "no up1"
    topDF.columns = ["source_article_id", "topic_similarity_article_median"]
    #print topDF
    print "no up2"
    df_top = df_top.merge(topDF, on="source_article_id")
    #print df_top[(df_top['topic_similarity_article_median'] >0)]
    print "no up3"

    df_sem = pd.read_sql("select source_article_id, target_article_id, sim as sem_similarity from semantic_similarity", conn)
    print "no up4"
    semDF = df_sem.groupby("source_article_id", as_index=False)["sem_similarity"].median()
    #rename
    print "no up5"
    semDF.columns = ["source_article_id", "sem_similarity_article_median"]
    print "no up6"
    #print df_top
    df_sem = df_sem.merge(semDF, on="source_article_id")
    #print len(df_sem)
    print "no up7"

    df1 = no_dup.merge(df_sem[['source_article_id', 'sem_similarity', 'sem_similarity_article_median']], on="source_article_id")
    #print no_dup
    del df_sem, semDF
    df = no_dup.merge(df_top[['source_article_id', 'topic_similarity', 'topic_similarity_article_median']], on="source_article_id")
    print "no up9"
    del no_dup
    del df_top, topDF

    table = ""
    table += resultTableLine(df, "src_degr > target_degr", "df.rel_degree > 0")
    table += resultTableLine(df, "src_degr <= target_degr", "df.rel_degree <= 0")
    table += resultTableLine(df, "src_in_degr > target_in_degr", "df.rel_in_degree > 0")
    table += resultTableLine(df, "src_in_degr <= target_in_degr", "df.rel_in_degree <= 0")
    table += resultTableLine(df, "src_out_degr > target_out_degr", "df.rel_out_degree > 0")
    table += resultTableLine(df, "src_out_degr <= target_out_degr", "df.rel_out_degree <= 0")
    table += resultTableLine(df, "src_kcore > target_kcore", "df.rel_kcore > 0")
    table += resultTableLine(df, "src_kcore <= target_kcore", "df.rel_kcore <= 0")
    table += resultTableLine(df, "src_page_rank > target_page_rank", "df.rel_page_rank > 0")
    table += resultTableLine(df, "src_page_rank <= target_page_rank", "df.rel_page_rank <= 0")
    table += resultTableLine(df1, "text_sim > median(text_sim) of page", "df.sem_similarity > df.sem_similarity_article_median")
    table += resultTableLine(df1, "text_sim <= median(text_sim) of page", "df.sem_similarity <= df.sem_similarity_article_median")
    table += resultTableLine(df, "topic_sim > median(topic_sim) of page", "df.topic_similarity > df.topic_similarity_article_median")
    table += resultTableLine(df, "topic_sim <= median(topic_sim) of page", "df.topic_similarity <= df.topic_similarity_article_median")
    table += resultTableLine(df, "left third of screen", "df.target_x_coord_1920_1080 <= 360")
    table += resultTableLine(df, "middle third of screen", "(df.target_x_coord_1920_1080 > 360) & (df.target_x_coord_1920_1080 <= 720)")
    table += resultTableLine(df, "right third of screen", "df.target_x_coord_1920_1080 > 720")
    table += resultTableLine(df, "position = lead", "df.visual_region == 'lead'")
    table += resultTableLine(df, "position = body", "(df.visual_region == 'body') | (df.visual_region == 'left-body')")
    table += resultTableLine(df, "position = navbox", "df.visual_region == 'navbox'")
    #table += resultTableLine (df, "position = left-body", "df.visual_region == 'left-body'")
    table += resultTableLine(df, "position = infobox", "df.visual_region == 'infobox'")

    print table
from wsd.database import MySQLDatabase
from graph_tool.all import *
from conf import *

db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
conn = db._create_connection()
cursor = conn.cursor()

cursor.execute('SELECT source_article_id, target_article_id FROM link_occurences;')
result = cursor.fetchall()

wikipedia = Graph()
for link in result:
    wikipedia.add_edge(link[0], link[1])

# filter all nodes that have no edges
wikipedia = GraphView(wikipedia, vfilt=lambda v: v.out_degree() + v.in_degree() > 0)

print "clust"
wikipedia.vertex_properties["local_clust"] = local_clustering(wikipedia)
print "page_rank"
wikipedia.vertex_properties["page_rank"] = pagerank(wikipedia)
print "eigenvector_centr"
eigenvalue, eigenvectorcentr = eigenvector(wikipedia)
wikipedia.vertex_properties["eigenvector_centr"] = eigenvectorcentr
print "kcore"
wikipedia.vertex_properties["kcore"] = kcore_decomposition(wikipedia)
def links_heatmap_rel_prob():
    #http://stackoverflow.com/questions/2369492/generate-a-heatmap-in-matplotlib-using-a-scatter-data-set
    # Get URLs from a text file, remove white space.
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    coords = db_worker_view.retrieve_all_links_coords()

    x = []
    y = []
    page_lengths = db_worker_view.retrieve_all_page_lengths()
    for coord in coords:
        x_normed = float(coord['x']) / float(1920)
        y_normed = float(coord['y']) / float(page_lengths[coord['source_article_id']])
        if x_normed <= 1.0 and y_normed <= 1.0:
            x.append(x_normed)
            y.append(y_normed)

    links_heatmap_hist, xedges, yedges = np.histogram2d(x, y, normed=True, bins=100)
    links_extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]

    coords = db_worker_view.retrieve_all_links_coords_clicks()
    print 'coord loaded'
    links = {}
    x = []
    y = []
    values = []
    for coord in coords:
        try:
            v = links[coord['key']]
            links[coord['key']] += 1
        except:
            links[coord['key']] = 0

    for coord in coords:
        x_normed = float(coord['x']) / float(1920)
        y_normed = float(coord['y']) / float(coord['page_length'])
        if x_normed <= 1.0 and y_normed <= 1.0:
            x.append(x_normed)
            y.append(y_normed)
            if links[coord['key']] == 0:
                #x.append(x_normed)
                #y.append(y_normed)
                values.append(float(coord['counts']))
            else:
                values.append(float(coord['counts']) / float(links[coord['key']]))

    clicks_heatmap_hist, xedges, yedges = np.histogram2d(x, y, bins=100, normed=True, weights=values)
    clicks_extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]

    substraction_hist = np.subtract(clicks_heatmap_hist, links_heatmap_hist)
    #rel_prob_hist = np.divide(clicks_heatmap_hist, links_heatmap_hist)

    with np.errstate(divide='ignore', invalid='ignore'):
        rel_prob_hist = np.divide(clicks_heatmap_hist, links_heatmap_hist)
        rel_prob_hist[rel_prob_hist == np.inf] = 0
        rel_prob_hist = np.nan_to_num(rel_prob_hist)

    fig_size = (2.4, 2)

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(substraction_hist, extent=clicks_extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    plt.show()
    plt.savefig('output/clicks-links_heatmap_normed_self_loop.pdf')

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(rel_prob_hist, extent=clicks_extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    plt.show()
    plt.savefig('output/clicks_over_links_heatmap_normed_self_loop.pdf')

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(substraction_hist, extent=clicks_extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    plt.show()
    plt.savefig('output/clicks-links_heatmap_lognormed_self_loop.pdf')

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(rel_prob_hist, extent=clicks_extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    plt.show()
    plt.savefig('output/clicks_over_links_heatmap_lognormed_self_loop.pdf')

    substraction_hist = np.subtract(links_heatmap_hist, clicks_heatmap_hist)
    #rel_prob_hist = np.divide(clicks_heatmap_hist, links_heatmap_hist)

    with np.errstate(divide='ignore', invalid='ignore'):
        rel_prob_hist = np.divide(links_heatmap_hist, clicks_heatmap_hist)
        rel_prob_hist[rel_prob_hist == np.inf] = 0
        rel_prob_hist = np.nan_to_num(rel_prob_hist)

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(substraction_hist, extent=clicks_extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Normalized")
    plt.show()
    plt.savefig('output/links-clicks_heatmap_normed_self_loop.pdf')

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(rel_prob_hist, extent=clicks_extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Normalized")
    plt.show()
    plt.savefig('output/links_over_clicks_heatmap_normed_self_loop.pdf')

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(substraction_hist, extent=clicks_extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Normalized")
    plt.show()
    plt.savefig('output/links-clicks_heatmap_lognormed_self_loop.pdf')

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(rel_prob_hist, extent=clicks_extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Normalized")
    plt.show()
    plt.savefig('output/links_over_clicks_heatmap_lognormed_self_loop.pdf')

    print "done"
def multiple_links_heatmap():
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    coords = db_worker_view.retrieve_all_links_multpile_occ()
    print 'coord loaded'
    page_lengths = db_worker_view.retrieve_all_page_lengths()
    print 'lengths loaded'

    links = {}
    x = []
    y = []
    x_conf = []
    y_conf = []
    x_not_conf = []
    y_not_conf = []

    number_of_not_confident_clicks = 0
    number_of_confident_clicks = 0
    number_of_valid_normed_links = 0

    for coord in coords:
        try:
            v = links[coord['key']]
            links[coord['key']] += 1
        except:
            links[coord['key']] = 0

    for coord in coords:
        x_normed = float(coord['x']) / float(1920)
        y_normed = float(coord['y']) / float(page_lengths[coord['key'][0]])
        if x_normed <= 1.0 and y_normed <= 1.0:
            x.append(x_normed)
            y.append(y_normed)
            number_of_valid_normed_links += 1
            if links[coord['key']] == 0:
                x_conf.append(x_normed)
                y_conf.append(y_normed)
                number_of_confident_clicks += 1
            else:
                x_not_conf.append(x_normed)
                y_not_conf.append(y_normed)
                number_of_not_confident_clicks += 1

    print '###########'
    print number_of_confident_clicks
    print number_of_not_confident_clicks
    print number_of_valid_normed_links
    print len(coords)
    print '###########'

    heatmap, xedges, yedges = np.histogram2d(x_conf, y_conf, bins=100)
    extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]

    fig_size = (2.4, 2)
    #fig_size = (3.5, 3)

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Log Normalized")
    plt.show()
    plt.savefig('output/links_heatmap_lognormed_self_loop_unique.pdf')

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Normalized")
    plt.show()
    plt.savefig('output/links_heatmap_normed_self_loop_unique.pdf')

    print "unique done"

    heatmap, xedges, yedges = np.histogram2d(x_not_conf, y_not_conf, bins=100)
    extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]

    fig_size = (2.4, 2)
    #fig_size = (3.5, 3)

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Log Normalized")
    plt.show()
    plt.savefig('output/links_heatmap_lognormed_self_loop_multiple.pdf')

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Normalized")
    plt.show()
    plt.savefig('output/links_heatmap_normed_self_loop_multiple.pdf')

    print "done"
from wsd.database import MySQLDatabase
from graph_tool.all import *
from conf import *

__author__ = 'dimitrovdr'

db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
db_work_view = db.get_work_view()

wikipedia = Graph()

for link in db_work_view.retrieve_all_internal_transitions_counts():
    for i in range(int(link['counts'])):
        wikipedia.add_edge(link['from'], link['to'])
    #print 'from %s, to %s', link['from'], link['to']

#wikipedia.save("output/transitionsnetwork.xml.gz")

# filter all nodes that have no edges
transitions_network = GraphView(wikipedia, vfilt=lambda v: v.out_degree() + v.in_degree() > 0)

transitions_network.save("output/transitionsnetworkweighted.xml.gz")

print "Stats for transitions network:"
print "number of nodes: %d" % transitions_network.num_vertices()
print "number of edges: %d" % transitions_network.num_edges()
def weighted_pagerank_hyp_engineering(labels):
    #read vocab, graph
    graph = read_pickle(SSD_HOME + "pickle/graph")
    print "loaded graph"
    values = read_pickle(SSD_HOME + "pickle/values")
    values_kcore = read_pickle(SSD_HOME + "pickle/values_kcore")

    # transform kcore values to model going out of the kcore
    values_kcore = [1. / np.sqrt(float(x)) for x in values_kcore]
    print 'kcore values transformation'

    #sem_sim_hyp = read_pickle(SSD_HOME+"pickle/sem_sim_hyp")
    #print "sem_sim_hyp values"
    #lead_hyp = read_pickle(SSD_HOME+"pickle/lead_hyp")
    #infobox_hyp = read_pickle(SSD_HOME+"pickle/infobox_hyp")
    #left_body_hyp = read_pickle(SSD_HOME+"pickle/left-body_hyp")
    #print "gamma values"

    vocab = read_pickle(SSD_HOME + "pickle/vocab")
    print "loaded vocab"

    state_count = len(vocab)
    states = vocab.keys()
    shape = (state_count, state_count)

    hyp_structural = csr_matrix((values, (graph[0], graph[1])), shape=shape, dtype=np.float)
    hyp_kcore = csr_matrix((values_kcore, (graph[0], graph[1])), shape=shape, dtype=np.float)
    print "hyp_kcore"

    del graph
    del values_kcore
    print "after delete"

    #read sem sim from db and create hyp
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id, sim from semantic_similarity', conn)
    print 'map sem sim'
    sem_sim_hyp_i = map_to_hyp_indicies(vocab, df['source_article_id'])
    sem_sim_hyp_j = map_to_hyp_indicies(vocab, df['target_article_id'])
    hyp_sem_sim = csr_matrix((df['sim'].values, (sem_sim_hyp_i, sem_sim_hyp_j)), shape=shape, dtype=np.float)
    print 'done map sem sim'
    print hyp_sem_sim.shape
    del sem_sim_hyp_i
    del sem_sim_hyp_j
    del df

    #read vis from csv and create hyp
    lead = pd.read_csv(TMP + 'lead.tsv', sep='\t')
    lead_i = map_to_hyp_indicies(vocab, lead['source_article_id'])
    lead_j = map_to_hyp_indicies(vocab, lead['target_article_id'])
    lead_v = np.ones(len(lead_i), dtype=np.float)
    hyp_lead = csr_matrix((lead_v, (lead_i, lead_j)), shape=shape, dtype=np.float)
    print 'done map lead'
    print hyp_lead.shape
    del lead
    del lead_i
    del lead_j
    del lead_v

    infobox = pd.read_csv(TMP + 'infobox.tsv', sep='\t')
    infobox_i = map_to_hyp_indicies(vocab, infobox['source_article_id'])
    infobox_j = map_to_hyp_indicies(vocab, infobox['target_article_id'])
    infobox_v = np.ones(len(infobox_i), dtype=np.float)
    hyp_infobox = csr_matrix((infobox_v, (infobox_i, infobox_j)), shape=shape, dtype=np.float)
    print 'done map infobox'
    print hyp_infobox.shape
    del infobox
    del infobox_i
    del infobox_j
    del infobox_v

    left_body = pd.read_csv(TMP + 'left-body.tsv', sep='\t')
    left_body_i = map_to_hyp_indicies(vocab, left_body['source_article_id'])
    left_body_j = map_to_hyp_indicies(vocab, left_body['target_article_id'])
    left_body_v = np.ones(len(left_body_i), dtype=np.float)
    hyp_left_body = csr_matrix((left_body_v, (left_body_i, left_body_j)), shape=shape, dtype=np.float)
    print 'done map left-body'
    print hyp_left_body.shape
    del left_body
    del left_body_i
    del left_body_j
    del left_body_v

    #add the visual hyps to one matrix and set all non zero fields to 1.0
    print 'before gamma'
    hyp_gamma = hyp_left_body + hyp_infobox + hyp_lead
    hyp_gamma.data = np.ones_like(hyp_gamma.data, dtype=np.float)
    print 'after gamma'

    del hyp_left_body
    del hyp_infobox
    del hyp_lead

    #norm
    print "in norm each"
    hyp_structural = norm(hyp_structural)
    hyp_kcore = norm(hyp_kcore)
    hyp_sem_sim = norm(hyp_sem_sim)
    hyp_gamma = norm(hyp_gamma)

    #engineering of hypos and norm again
    hyp_kcore_struct = norm(hyp_structural + hyp_kcore)
    hyp_visual_struct = norm(hyp_structural + hyp_gamma)
    hyp_sem_sim_struct = norm(hyp_structural + hyp_sem_sim)

    hyp_mix_semsim_kcore = norm(hyp_kcore + hyp_sem_sim)
    hyp_mix_semsim_visual = norm(hyp_sem_sim + hyp_gamma)
    hyp_mix_kcore_visual = norm(hyp_kcore + hyp_gamma)

    hyp_all = norm(hyp_kcore + hyp_sem_sim + hyp_gamma)
    hyp_all_struct = norm(hyp_kcore + hyp_sem_sim + hyp_gamma + hyp_structural)

    hyp_semsim_struct = norm(hyp_structural + hyp_kcore)

    print 'test hypos'
    hypos = {}
    hypos['hyp_kcore'] = hyp_kcore
    hypos['hyp_sem_sim'] = hyp_sem_sim
    hypos['hyp_visual'] = hyp_gamma
    hypos['hyp_kcore_struct'] = hyp_kcore_struct
    hypos['hyp_visual_struct'] = hyp_visual_struct
    hypos['hyp_sem_sim_struct'] = hyp_sem_sim_struct
    hypos['hyp_mix_semsim_kcore'] = hyp_mix_semsim_kcore
    hypos['hyp_mix_semsim_visual'] = hyp_mix_semsim_visual
    hypos['hyp_mix_kcore_visual'] = hyp_mix_kcore_visual
    hypos['hyp_all'] = hyp_all
    hypos['hyp_all_struct'] = hyp_all_struct

    #load network
    print "weighted page rank engineering"
    wikipedia = load_graph("output/wikipedianetwork.xml.gz")

    #for label, hyp in hypos.iteritems():
    name = '_'.join(labels)
    for label in labels:
        print label
        eprop = create_eprop(wikipedia, hypos[label], vocab)
        wikipedia.edge_properties[label] = eprop
        #for damping in [0.8, 0.85, 0.9, 0.95]:
        for damping in [0.85]:
            key = label + "_page_rank_weighted_" + str(damping)
            print key
            wikipedia.vertex_properties[key] = pagerank(wikipedia, weight=eprop, damping=damping)
        print 'save network'
        wikipedia.save("output/weightedpagerank/wikipedianetwork_hyp_engineering_" + name + ".xml.gz")

    print 'save network'
    wikipedia.save("output/weightedpagerank/wikipedianetwork_hyp_engineering_" + name + ".xml.gz")
    print 'done'
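# weighted_pagerank_hyp_engineering() relies on a few helpers defined elsewhere in the
# repository (map_to_hyp_indicies, norm, create_eprop). Minimal sketches of the first two
# (hypothetical implementations, not the originals): map_to_hyp_indicies maps article ids to
# vocab row/column indices, and norm row-normalizes a sparse hypothesis matrix.
import scipy.sparse

def map_to_hyp_indicies(vocab, ids):
    # look up the matrix index of every article id in the vocab dict
    return [vocab[int(x)] for x in ids]

def norm(hyp):
    # row-normalize a scipy.sparse matrix so every non-empty row sums to 1
    row_sums = np.asarray(hyp.sum(axis=1)).ravel()
    row_sums[row_sums == 0] = 1.0
    d = scipy.sparse.diags(1.0 / row_sums)
    return d.dot(hyp)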
from wsd.database import MySQLDatabase
from graph_tool.all import *
from conf import *

__author__ = 'dimitrovdr'

db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
db_work_view = db.get_work_view()

wikipedia = Graph()

for link in db_work_view.retrieve_all_internal_transitions():
    wikipedia.add_edge(link['from'], link['to'])
    #print 'from %s, to %s', link['from'], link['to']

#wikipedia.save("output/transitionsnetwork.xml.gz")

# filter all nodes that have no edges
transitions_network = GraphView(wikipedia, vfilt=lambda v: v.out_degree() + v.in_degree() > 0)

print "clust"
transitions_network.vertex_properties["local_clust"] = local_clustering(transitions_network)
print "page_rank"
transitions_network.vertex_properties["page_rank"] = pagerank(transitions_network)
print "eigenvector_centr"
eigenvalue, eigenvectorcentr = eigenvector(transitions_network)
transitions_network.vertex_properties["eigenvector_centr"] = eigenvectorcentr
def weighted_pagerank():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()

    cursor.execute('SELECT source_article_id, target_article_id, occ FROM link_occurences;')
    result = cursor.fetchall()

    wikipedia = Graph()
    eprop = wikipedia.new_edge_property("int")
    for link in result:
        e = wikipedia.add_edge(link[0], link[1])
        eprop[e] = link[2]

    # filter all nodes that have no edges
    wikipedia = GraphView(wikipedia, vfilt=lambda v: v.out_degree() + v.in_degree() > 0)

    print "page_rank_weighted"
    for damping in [0.8, 0.85, 0.9, 0.95]:
        print damping
        key = "page_rank_weighted" + str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, weight=eprop, damping=damping)

    print "page_rank"
    for damping in [0.8, 0.85, 0.9, 0.95]:
        print damping
        key = "page_rank" + str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, damping=damping)

    wikipedia.save("output/weightedpagerank/wikipedianetwork_link_occ.xml.gz")
    print 'link_occ done'

    cursor.execute('SELECT source_article_id, target_article_id, sim FROM semantic_similarity group by '
                   'source_article_id, target_article_id;')
    result = cursor.fetchall()

    wikipedia = Graph()
    eprop = wikipedia.new_edge_property("double")
    for link in result:
        e = wikipedia.add_edge(link[0], link[1])
        eprop[e] = link[2]

    # filter all nodes that have no edges
    print 'filter nodes graph tool specific code'
    wikipedia = GraphView(wikipedia, vfilt=lambda v: v.out_degree() + v.in_degree() > 0)

    print "page_rank_weighted"
    for damping in [0.8, 0.85, 0.9, 0.95]:
        print damping
        key = "page_rank_weighted" + str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, weight=eprop, damping=damping)

    print "page_rank"
    for damping in [0.8, 0.85, 0.9, 0.95]:
        print damping
        key = "page_rank" + str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, damping=damping)

    wikipedia.save("output/weightedpagerank/wikipedianetwork_sem_sim_distinct_links.xml.gz")
    print 'sem sim distinct links done'

    cursor.execute('SELECT source_article_id, target_article_id, sim FROM semantic_similarity;')
    result = cursor.fetchall()

    wikipedia = Graph()
    eprop = wikipedia.new_edge_property("double")
    for link in result:
        e = wikipedia.add_edge(link[0], link[1])
        eprop[e] = link[2]

    # filter all nodes that have no edges
    wikipedia = GraphView(wikipedia, vfilt=lambda v: v.out_degree() + v.in_degree() > 0)

    print "page_rank_weighted"
    for damping in [0.8, 0.85, 0.9, 0.95]:
        print damping
        key = "page_rank_weighted" + str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, weight=eprop, damping=damping)

    print "page_rank"
    for damping in [0.8, 0.85, 0.9, 0.95]:
        print damping
        key = "page_rank" + str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, damping=damping)

    wikipedia.save("output/weightedpagerank/wikipedianetwork_sem_sim.xml.gz")
    print 'sem_sim done'