Example #1
    def _evaluate_disambiguations(self):
        INPUT_FILE = self.read_path('Please enter the path of the samples file [.xml]', default='./tmp/samples.xml')
        LOGGING_PATH = self.read_path('Please enter the path of the logging file [.log]', default='./tmp/evaluation3.log', must_exist=False)
        
        CONTINUE = self.read_yes_no('This process might take from several minutes to several hours.\nDo you want to continue?')

        if not CONTINUE:
            print '# Aborting...'
            return

        print '# Starting evaluation...'
        # setup logging
        LOGGING_FORMAT = '%(levelname)s:\t%(asctime)-15s %(message)s'
        logging.basicConfig(filename=LOGGING_PATH, level=logging.DEBUG, format=LOGGING_FORMAT, filemode='w')

        # connecting to db
        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
        work_view = db.get_work_view()

        # measure time
        start = time.clock()

        evaluator = Evaluator(INPUT_FILE, work_view)
        result = evaluator.evaluate_disambiguations()

        seconds = round(time.clock() - start)
        print 'Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60)
        print 'Evaluation done! - precision: %d%%, recall: %d%%' % (round(result['precision']*100), round(result['recall']*100))
Example #2
def export_data_unresolved():

    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                       DATABASE_NAME)
    db_work_view = db.get_work_view()
    connection = db_work_view._db_connection

    df_clickstream = pn.read_csv(
        '/home/ddimitrov/data/enwiki201608_unresolved_redirects/2016_08_clickstream_unresolved.tsv',
        sep='\t',
        error_bad_lines=False)

    df_clickstream['prev'] = df_clickstream['prev'].str.replace('_', ' ')
    df_clickstream['curr'] = df_clickstream['curr'].str.replace('_', ' ')
    df_clickstream['curr_unresolved'] = df_clickstream[
        'curr_unresolved'].str.replace('_', ' ')

    df_redirects_candidates = pn.read_sql(
        'select * from redirects_candidates_sample', connection)

    sample_unresoleved = pn.merge(
        df_redirects_candidates,
        df_clickstream,
        how='left',
        left_on=['source_article_name', 'target_article_name'],
        right_on=['prev', 'curr_unresolved'])

    sample_unresoleved['n'].fillna(0, inplace=True)
    sample_unresoleved.to_csv(
        '/home/ddimitrov/data/enwiki201608_unresolved_redirects/data_unresolved.tsv',
        sep='\t',
        encoding="utf-8")
Example #3
def pickle_vis_data_pandas():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()


    df = pd.read_sql('select source_article_id, target_article_id, target_y_coord_1920_1080, target_x_coord_1920_1080, visual_region from link_features', conn)
    print len(df)

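    # deduplicate links: keep only the top-most occurrence (smallest y, then x) of each (source, target) pair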
    no_dup = df.sort(['source_article_id','target_y_coord_1920_1080','target_x_coord_1920_1080']).groupby(["source_article_id", "target_article_id"]).first()
    print len(no_dup)

    feature = no_dup.loc[no_dup['visual_region']=='lead']
    print len(feature)
    feature.reset_index(inplace=True)


    feature = no_dup.loc[no_dup['visual_region']=='infobox']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/infobox.tsv', sep='\t', index=False)

    feature = no_dup.loc[no_dup['visual_region']=='navbox']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/navbox.tsv', sep='\t', index=False)

    feature = no_dup.loc[no_dup['visual_region']=='left-body']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/left-body.tsv', sep='\t',index=False)

    feature = no_dup.loc[no_dup['visual_region']=='body']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/body.tsv', sep='\t',index=False)
Example #4
    def table_parser(self, file_name, root):

        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
        db_build_view = db.get_build_view()

        cursor = db_build_view._cursor

        # setup logging
        LOGGING_FORMAT = '%(levelname)s:\t%(asctime)-15s %(message)s'
        LOGGING_PATH = 'tmp/tableclasses-dbinsert.log'
        logging.basicConfig(filename=LOGGING_PATH, level=logging.DEBUG, format=LOGGING_FORMAT, filemode='w')

        html_parser = WikipediaHTMLTableParser()
        zip_file_path = os.path.join(root, file_name)
        html = self.zip2html(zip_file_path)
        html_parser.feed(html.decode('utf-8'))
        source_article_id = file_name.split('_')[1]
        try:
            fed_parser = WikipediaFedTextParser(html_parser.get_data())
            table_classes = fed_parser.table_classes(None)
            table_classes = list(set(table_classes))
            for table_class in table_classes:
                self.insert_table_class(source_article_id, table_class, cursor)
        except KeyError:
            db_build_view._db_connection.rollback()
            logging.error('KeyError FedTextParser source article id: %s ' % source_article_id)
        db_build_view.commit()
        db_build_view.reset_cache()
Example #5
def build_links_position_table():
    """creates up the basic database structure
    """
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                       DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()

    cursor.execute(
        'CREATE TABLE `links` ('
        '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,'
        '`source_article_id` BIGINT UNSIGNED NOT NULL,'
        '`target_article_id` BIGINT UNSIGNED NOT NULL,'
        ' target_position_in_text INT UNSIGNED NOT NULL,'
        ' target_position_in_text_only INT UNSIGNED,'
        ' target_position_in_section INT UNSIGNED,'
        ' target_position_in_section_in_text_only INT UNSIGNED,'
        ' section_name VARCHAR(1000) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
        ' section_number INT UNSIGNED,'
        ' target_position_in_table INT UNSIGNED,'
        ' table_number INT UNSIGNED,'
        ' table_css_class VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
        ' table_css_style VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
        ' target_x_coord_1920_1080 INT UNSIGNED DEFAULT NULL,'
        ' target_y_coord_1920_1080 INT UNSIGNED DEFAULT NULL ,'
        'INDEX(`target_article_id`),'
        'INDEX(`source_article_id`)'
        ') ENGINE=InnoDB;')
    connection.close()
Example #6
    def table_parser(self, file_name, root):

        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                           DATABASE_NAME)
        db_build_view = db.get_build_view()

        cursor = db_build_view._cursor

        # setup logging
        LOGGING_FORMAT = '%(levelname)s:\t%(asctime)-15s %(message)s'
        LOGGING_PATH = 'tmp/tableclasses-dbinsert.log'
        logging.basicConfig(filename=LOGGING_PATH,
                            level=logging.DEBUG,
                            format=LOGGING_FORMAT,
                            filemode='w')

        html_parser = WikipediaHTMLTableParser()
        zip_file_path = os.path.join(root, file_name)
        html = self.zip2html(zip_file_path)
        html_parser.feed(html.decode('utf-8'))
        source_article_id = file_name.split('_')[1]
        try:
            fed_parser = WikipediaFedTextParser(html_parser.get_data())
            table_classes = fed_parser.table_classes(None)
            table_classes = list(set(table_classes))
            for table_class in table_classes:
                self.insert_table_class(source_article_id, table_class, cursor)
        except KeyError:
            db_build_view._db_connection.rollback()
            logging.error('KeyError FedTextParser source article id: %s ' %
                          source_article_id)
        db_build_view.commit()
        db_build_view.reset_cache()
Example #7
def build_links_position_table():
    """creates up the basic database structure
    """
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()

    cursor.execute('CREATE TABLE `redirects_candidates` ('
                      '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,'
                      '`source_article_id` BIGINT UNSIGNED NOT NULL,'
                      '`target_article_id` BIGINT UNSIGNED NULL,'
                      '`target_article_name` VARCHAR(1000) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
                      ' target_position_in_text INT UNSIGNED NOT NULL,'
                      ' target_position_in_text_only INT UNSIGNED,'
                      ' target_position_in_section INT UNSIGNED,'
                      ' target_position_in_section_in_text_only INT UNSIGNED,'
                      ' section_name VARCHAR(1000) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
                      ' section_number INT UNSIGNED,'
                      ' target_position_in_table INT UNSIGNED,'
                      ' table_number INT UNSIGNED,'
                      ' table_css_class VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
                      ' table_css_style VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
                      ' target_x_coord_1920_1080 INT UNSIGNED DEFAULT NULL,'
                      ' target_y_coord_1920_1080 INT UNSIGNED DEFAULT NULL ,'
                      'INDEX(`target_article_id`),'
                      'INDEX(`source_article_id`)'
                  ') ENGINE=InnoDB;')
    connection.close()
Example #8
def req():
    # Get URLs from a text file, remove white space.
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    articles = db_worker_view.retrieve_all_articles()
    #articles = db_worker_view.retrieve_all_articles_questionmark()
    # measure time
    start = time.clock()
    start_time_iteration = start
    iteration_number = 483
    for i, article in enumerate(articles):
        # print some progress
        if i % 10000 == 0:
            #print time for the iteration
            seconds = time.clock() - start_time_iteration
            m, s = divmod(seconds, 60)
            h, m = divmod(m, 60)
            print "Number of crawled articles: %d. Total time for last iteration of 10000 articles: %d:%02d:%02d" % (i, h, m, s)
            start_time_iteration = time.clock()
            iteration_number += 1

        # Thread pool.
        # Blocks other threads (more than the set limit).
        pool.acquire(blocking=True)
        # Create a new thread.
        # Pass each URL (i.e. u parameter) to the worker function.
        t = threading.Thread(target=worker, args=(MEDIAWIKI_API_ENDPOINT+urllib.quote(article['title'])+'/'+str(article['rev_id']), article, iteration_number))

        # Start the newly created thread.
        t.start()
    seconds = time.clock() - start
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    print "Total time: %d:%02d:%02d" % (h, m, s)
Example #9
    def run(self):
        self.print_title('This is the interactive runner program')
        self.create_tmp_if_not_exists()

        INPUT_FILE = self.read_path('Please enter the path of the input file [.txt]', default='./tmp/input.txt')
        OUTPUT_FILE = self.read_path('Please enter the path of the output file [.html]', default='./tmp/output.html', must_exist=False)
        LOGGING_PATH = self.read_path('Please enter the path of the logging file [.log]', default='./tmp/runner.log', must_exist=False)


        print '# Starting runner...'
        # setup logging
        LOGGING_FORMAT = '%(levelname)s:\t%(asctime)-15s %(message)s'
        logging.basicConfig(filename=LOGGING_PATH, level=logging.DEBUG, format=LOGGING_FORMAT, filemode='w')

        # measure time
        start = time.clock()

        # connect to db
        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
        work_view = db.get_work_view()

        # read input
        f = open(INPUT_FILE, 'r')
        text = f.read()
        text = text.replace(' ', ' ')
        f.close()

        # create dummy article
        article = {}
        article['type'] = 'article'
        article['id'] = None
        article['title'] = None
        article['text'] = text
        article['links'] = []

        # identify links
        link_detector = LinkDetector(work_view)
        link_detector.detect_links(article)
        # identify terms
        #term_identifier = TermIdentifier()
        #article = term_identifier.identify_terms(text)

        # find possible meanings
        meaning_finder = MeaningFinder(work_view)
        meaning_finder.find_meanings(article)

        # calculate relatedness
        relatedness_calculator = RelatednessCalculator(work_view)

        # decide for meaning
        decider = Decider(relatedness_calculator)
        decider.decide(article)

        # output results
        html_outputter = HTMLOutputter()
        html_outputter.output(article, OUTPUT_FILE)

        seconds = round(time.clock() - start)
        print 'Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60)
Example #10
def pickle_correlations_zeros():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id, IFNULL(counts, 0) as counts from link_features group by source_article_id, target_article_id', conn)
    print 'group'
    article_counts = df.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    print 'write to file'
    article_counts[["target_article_id","counts"]].to_csv(TMP+'article_counts.tsv', sep='\t', index=False)
Example #11
    def __init__(self, path):
        #os.environ["DISPLAY"]=":1"
        print path
        os.environ["DISPLAY"]=":1"
        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
        self.db_build_view = db.get_build_view()
        self.cursor = self.db_build_view._cursor

        self.app = QApplication(sys.argv)
        self.path = path
Example #12
    def __init__(self, path):
        #os.environ["DISPLAY"]=":1"
        print path
        os.environ["DISPLAY"] = ":1"
        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                           DATABASE_NAME)
        self.db_build_view = db.get_build_view()
        self.cursor = self.db_build_view._cursor

        self.app = QApplication(sys.argv)
        self.path = path
Example #13
def pickle_category_counts_distribution():
    results = {}
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    cursor = db_worker_view._cursor
    for category in ['lead', 'infobox', 'body', 'left-body', 'navbox']:
        try:
            cursor.execute('select counts from link_features where counts is not null and visual_region=%s;', (category,))
            result = cursor.fetchall()
            results[category] = result
        except MySQLdb.Error, e:
            print e
Example #14
def build_page_length_table():
    """creates up the basic database structure
    """
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()

    cursor.execute('CREATE TABLE `page_length` ('
                      '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY,'
                      ' page_length_1920_1080 INT UNSIGNED DEFAULT NULL'
                  ') ENGINE=InnoDB;')
    connection.close()
Example #15
def build_page_length_table():
    """creates up the basic database structure
    """
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()

    cursor.execute('CREATE TABLE `redirects_candidates_page_length` ('
                      '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY,'
                      ' page_length_1920_1080 INT UNSIGNED DEFAULT NULL'
                  ') ENGINE=InnoDB;')
    connection.close()
Example #16
    def _create_structure(self):

        # measure time
        start = time.clock()

        # creating structure
        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
        db.build()

        seconds = round(time.clock() - start)
        logging.info('Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60))
        print 'Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60)
Example #17
def pickle_aggregated_counts_distribution():

    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    cursor = db_worker_view._cursor
    results = {}
    try:
        cursor.execute('select sum(counts) from clickstream_derived_internal_links group by prev_id;')
        result = cursor.fetchall()
        results['source_article']=result
    except MySQLdb.Error, e:
        print e
Example #18
def pickle_vis_data_pandas():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                       DATABASE_NAME)
    conn = db._create_connection()

    df = pd.read_sql(
        'select source_article_id, target_article_id, target_y_coord_1920_1080, target_x_coord_1920_1080, visual_region from link_features',
        conn)
    print len(df)

    no_dup = df.sort([
        'source_article_id', 'target_y_coord_1920_1080',
        'target_x_coord_1920_1080'
    ]).groupby(["source_article_id", "target_article_id"]).first()
    print len(no_dup)

    feature = no_dup.loc[no_dup['visual_region'] == 'lead']
    print len(feature)
    feature.reset_index(inplace=True)

    feature = no_dup.loc[no_dup['visual_region'] == 'infobox']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id',
             'target_article_id']].to_csv('/home/ddimitrov/tmp/infobox.tsv',
                                          sep='\t',
                                          index=False)

    feature = no_dup.loc[no_dup['visual_region'] == 'navbox']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id',
             'target_article_id']].to_csv('/home/ddimitrov/tmp/navbox.tsv',
                                          sep='\t',
                                          index=False)

    feature = no_dup.loc[no_dup['visual_region'] == 'left-body']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id',
             'target_article_id']].to_csv('/home/ddimitrov/tmp/left-body.tsv',
                                          sep='\t',
                                          index=False)

    feature = no_dup.loc[no_dup['visual_region'] == 'body']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id',
             'target_article_id']].to_csv('/home/ddimitrov/tmp/body.tsv',
                                          sep='\t',
                                          index=False)
Example #19
def plot_degree_filtered_sql():
    print 'before select'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                       DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    cursor.execute(
        'SELECT source_article_id, target_article_id FROM link_occurences where source_article_id in '
        ' (select distinct prev_id from clickstream_derived_internal_links);')
    result = cursor.fetchall()
    network = Graph()
    print 'after select'
    print 'result len'
    print len(result)

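    # build the graph with one edge per (source, target) link row returned by the query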
    for i, link in enumerate(result):
        if i % 1000000 == 0:
            print i, len(result)
        network.add_edge(link[0], link[1])

    # filter all nodes that have no edges
    print 'filter nodes with degree zero graph tool specific code'
    network = GraphView(network,
                        vfilt=lambda v: v.out_degree() + v.in_degree() > 0)
    print 'before save'
    network.save(
        "output/wikipedianetworkfilteredwithtransitions_prev_id.xml.gz")
    print 'done'

    cursor.execute(
        'SELECT source_article_id, target_article_id FROM link_occurences where target_article_id in '
        ' (select distinct curr_id from clickstream_derived_internal_links);')
    result = cursor.fetchall()
    network = Graph()
    print 'after select'
    print 'result len'
    print len(result)

    for i, link in enumerate(result):
        if i % 1000000 == 0:
            print i, len(result)
        network.add_edge(link[0], link[1])

    # filter all nodes that have no edges
    print 'filter nodes with degree zero graph tool specific code'
    network = GraphView(network,
                        vfilt=lambda v: v.out_degree() + v.in_degree() > 0)
    print 'before save'
    network.save(
        "output/wikipedianetworkfilteredwithtransitions_curr_id.xml.gz")
    print 'done'
Example #20
def links_heatmap():
    #http://stackoverflow.com/questions/2369492/generate-a-heatmap-in-matplotlib-using-a-scatter-data-set
    # Get URLs from a text file, remove white space.
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    coords = db_worker_view.retrieve_all_links_coords()
    print 'coord loaded'
    x=[]
    y=[]

    page_lenghts = db_worker_view.retrieve_all_page_lengths()
    print 'lengths loaded'
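    # normalize each link position: x by the 1920 px viewport width, y by the length of its source page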
    for coord in coords:
        x_normed = float(coord['x'])/float(1920)
        y_normed = float(coord['y'])/float(page_lenghts[coord['source_article_id']])
        if  x_normed <=1.0 and y_normed <=1.0:
            x.append(x_normed)
            y.append(y_normed)



    heatmap, xedges, yedges = np.histogram2d(x, y, bins=100)
    extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]

    fig_size = (2.4, 2)
    #fig_size = (3.5, 3)
    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)

    plt.imshow(heatmap, extent=extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Log Normalized")

    plt.show()
    plt.savefig('output/links_heatmap_lognormed_self_loop.pdf')

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)

    plt.imshow(heatmap , extent=extent, origin='upper', norm=Normalize(),cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Normalized")

    plt.show()
    plt.savefig('output/links_heatmap_normed_self_loop.pdf')

    print "done"
Example #21
def correlations(network_name):
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    # wikipedia  graph  structural statistics

    results = None
    try:
        results = cursor.execute('select c.curr_id,  sum(c.counts) as counts from clickstream_derived c where c.link_type_derived= %s  group by c.curr_id;', ("internal-link",))
        results = cursor.fetchall()


    except MySQLdb.Error, e:
        print ('error retrieving xy coord for all links %s (%d)' % (e.args[1], e.args[0]))
Example #22
    def _create_structure(self):

        # measure time
        start = time.clock()

        # creating structure
        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                           DATABASE_NAME)
        db.build()

        seconds = round(time.clock() - start)
        logging.info('Finished after %02d:%02d minutes' %
                     (seconds / 60, seconds % 60))
        print 'Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60)
Example #23
def build_table():
    """creates up the basic database structure
    """
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()

    cursor.execute('CREATE TABLE `table_css_class` ('
                      '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,'
                      '`source_article_id` BIGINT UNSIGNED NOT NULL,'
                      ' css_class VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
                      'INDEX(`source_article_id`)'
                  ') ENGINE=InnoDB;')
    connection.close()
Example #24
def pickle_category_counts_distribution():
    results = {}
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                       DATABASE_NAME)
    db_worker_view = db.get_work_view()
    cursor = db_worker_view._cursor
    for category in ['lead', 'infobox', 'body', 'left-body', 'navbox']:
        try:
            cursor.execute(
                'select counts from link_features where counts is not null and visual_region=%s;',
                (category, ))
            result = cursor.fetchall()
            results[category] = result
        except MySQLdb.Error, e:
            print e
Example #25
def pickle_redirects_ids():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_work_view = db.get_work_view()

    redirects_list_id = []
    with open(HOME+"data/candidate_articles.tsv") as f:
        next(f)
        for line in f:
            line = line.strip().split('\t')
            #look up id
            tmp = db_work_view.resolve_title(line[0].replace('_',' '))
            #print tmp
            if tmp is not None:
                redirects_list_id.append(tmp['id'])
    pickle.dump(redirects_list_id, open(SSD_HOME+"pickle/redirects_ids.obj", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
Example #26
def pickle_aggregated_counts_distribution():

    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                       DATABASE_NAME)
    db_worker_view = db.get_work_view()
    cursor = db_worker_view._cursor
    results = {}
    try:
        cursor.execute(
            'select sum(counts) from clickstream_derived_internal_links group by prev_id;'
        )
        result = cursor.fetchall()
        results['source_article'] = result
    except MySQLdb.Error, e:
        print e
Example #27
def build_table():
    """creates up the basic database structure
    """
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                       DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()

    cursor.execute(
        'CREATE TABLE `table_css_class` ('
        '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,'
        '`source_article_id` BIGINT UNSIGNED NOT NULL,'
        ' css_class VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
        'INDEX(`source_article_id`)'
        ') ENGINE=InnoDB;')
    connection.close()
Example #28
def pickle_correlations_zeros_january():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id from link_features', conn)
    print 'loaded links'
    df2 = pd.read_sql('select prev_id, curr_id, counts from clickstream_derived_en_201501  where link_type_derived= "internal-link";',  conn)
    print 'loaded counts'
    result = pd.merge(df, df2, how='left', left_on = ['source_article_id', 'target_article_id'], right_on = ['prev_id', 'curr_id'])
    print 'merged counts'
    print result
    article_counts = result.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    article_counts['counts'].fillna(0.0, inplace=True)
    print article_counts
    print 'write to file'
    article_counts[["target_article_id","counts"]].to_csv(TMP+'january_article_counts.tsv', sep='\t', index=False)
Example #29
def clicks_heatmap_total():
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    coords = db_worker_view.retrieve_all_links_coords_clicks()
    print 'coord loaded'
    links = {}
    x = []
    y = []
    values = []
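    # normalize click coordinates; collect the click counts to weight the heatmap bins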
    for coord in coords:
        x_normed = float(coord['x'])/float(1920)
        y_normed = float(coord['y'])/float(coord['page_length'])
        if x_normed <=1.0 and y_normed <=1.0:
            x.append(x_normed)
            y.append(y_normed)
            values.append(float(coord['counts']))

    heatmap, xedges, yedges = np.histogram2d(x, y, bins=100, weights=values)
    extent = [xedges[0], xedges[-1], yedges[-1], yedges[0] ]


    fig_size = (2.4, 2)

    plt.clf()
    plt.figure(figsize=fig_size)

    plt.grid(True)
    plt.imshow(heatmap , extent=extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Clicks Heatmap Log Normalized")

    plt.show()
    plt.savefig('output/clicks_heatmap_lognormed_self_loop_total.pdf')

    plt.clf()
    plt.figure(figsize=fig_size)

    plt.grid(True)
    plt.imshow(heatmap , extent=extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Clicks Heatmap Normalized")

    plt.show()
    plt.savefig('output/clicks_heatmap_normed_self_loop_total.pdf')
    print "done"
Example #30
def rbo():
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    sm = []
    try:
        cursor.execute('select curr_id, sum(counts) as counts_sum, curr_title from clickstream_derived where link_type_derived=%s group by curr_id order by counts_sum desc limit 10000;', ("entry-sm",))
        result = cursor.fetchall()
        for row in result:
            record = {}
            record['curr_id']= row[0]
            record['counts_sum'] = row[1]
            record['curr_title'] = row[2]
            sm.append(row[0])
    except MySQLdb.Error, e:
        print e
Example #31
    def _extract_articles(self):

        INPUT_FILE = WIKI_DUMP_XML_FILE  #self.read_path('Please enter the path of the wiki dump file [.xml]')
        #INPUT_FILE = "/home/ddimitrov/wikiwsd/data/training.xml"#self.read_path('Please enter the path of the wiki dump file [.xml]')
        MAX_ARTICLES_IN_QUEUE = 200  #self.read_number('How many articles should be kept in the memory at any time at most?', 200, 20, 1000)
        NUM_THREADS = 1  #self.read_number('How many threads shall be used to write to the database?', 20, 1, 50)
        CONTINUE = True  #self.read_yes_no('This process might take several days to finish.\nDo you want to continue?')

        if CONTINUE:
            # measure time
            start = time.clock()

            # connect to database and create article queue
            db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                               DATABASE_NAME)
            queue = Queue.Queue(maxsize=MAX_ARTICLES_IN_QUEUE)

            # create reader and threads
            reader = WikipediaReader(INPUT_FILE, queue, extract_text=False)
            threads = []
            for i in range(0, NUM_THREADS):
                inserter = ArticleInserter(queue, db.get_build_view())
                threads.append(inserter)

            # start reader
            reader.start()

            # start insert threads
            for thread in threads:
                thread.start()

            # wait for reading thread, queue and inserters to be done
            reader.join()
            queue.join()
            for thread in threads:
                thread.end()
            for thread in threads:
                thread.join()

            seconds = round(time.clock() - start)
            print 'Finished after %02d:%02d minutes' % (seconds / 60,
                                                        seconds % 60)

        else:
            print 'Aborting...'
Example #32
def pickle_redirects_ids():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                       DATABASE_NAME)
    db_work_view = db.get_work_view()

    redirects_list_id = []
    with open(HOME + "data/candidate_articles.tsv") as f:
        next(f)
        for line in f:
            line = line.strip().split('\t')
            #look up id
            tmp = db_work_view.resolve_title(line[0].replace('_', ' '))
            #print tmp
            if tmp is not None:
                redirects_list_id.append(tmp['id'])
    pickle.dump(redirects_list_id,
                open(SSD_HOME + "pickle/redirects_ids.obj", "wb"),
                protocol=pickle.HIGHEST_PROTOCOL)
Example #33
def plot_degree_filtered_sql():
    print 'before select'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    cursor.execute('SELECT source_article_id, target_article_id FROM link_occurences where source_article_id in '
                   ' (select distinct prev_id from clickstream_derived_internal_links);')
    result = cursor.fetchall()
    network = Graph()
    print 'after select'
    print 'result len'
    print len(result)

    for i, link in enumerate(result):
        if i % 1000000==0:
            print i, len(result)
        network.add_edge(link[0], link[1])

    # filter all nodes that have no edges
    print 'filter nodes with degree zero graph tool specific code'
    network = GraphView(network, vfilt=lambda v : v.out_degree()+v.in_degree()>0 )
    print 'before save'
    network.save("output/wikipedianetworkfilteredwithtransitions_prev_id.xml.gz")
    print 'done'

    cursor.execute('SELECT source_article_id, target_article_id FROM link_occurences where target_article_id in '
                   ' (select distinct curr_id from clickstream_derived_internal_links);')
    result = cursor.fetchall()
    network = Graph()
    print 'after select'
    print 'result len'
    print len(result)

    for i, link in enumerate(result):
        if i % 1000000==0:
            print i, len(result)
        network.add_edge(link[0], link[1])

    # filter all nodes that have no edges
    print 'filter nodes with degree zero graph tool specific code'
    network = GraphView(network, vfilt=lambda v : v.out_degree()+v.in_degree()>0 )
    print 'before save'
    network.save("output/wikipedianetworkfilteredwithtransitions_curr_id.xml.gz")
    print 'done'
Example #34
    def _extract_articles(self):


        INPUT_FILE = WIKI_DUMP_XML_FILE #self.read_path('Please enter the path of the wiki dump file [.xml]')
        #INPUT_FILE = "/home/ddimitrov/wikiwsd/data/training.xml"#self.read_path('Please enter the path of the wiki dump file [.xml]')
        MAX_ARTICLES_IN_QUEUE = 200#self.read_number('How many articles should be kept in the memory at any time at most?', 200, 20, 1000)
        NUM_THREADS = 1#self.read_number('How many threads shall be used to write to the database?', 20, 1, 50)
        CONTINUE = True#self.read_yes_no('This process might take several days to finish.\nDo you want to continue?')

        if CONTINUE:
            # measure time
            start = time.clock()

            # connect to database and create article queue
            db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
            queue = Queue.Queue(maxsize=MAX_ARTICLES_IN_QUEUE)

            # create reader and threads
            reader = WikipediaReader(INPUT_FILE, queue, extract_text=False)
            threads = []
            for i in range(0, NUM_THREADS):
                inserter = ArticleInserter(queue, db.get_build_view())
                threads.append(inserter)

            # start reader
            reader.start()

            # start insert threads
            for thread in threads:
                thread.start()

            # wait for reading thread, queue and inserters to be done
            reader.join()
            queue.join()
            for thread in threads:
                thread.end()
            for thread in threads:
                thread.join()

            seconds = round(time.clock() - start)
            print 'Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60)

        else:
            print 'Aborting...'
Example #35
def rbo():
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                       DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    sm = []
    try:
        cursor.execute(
            'select curr_id, sum(counts) as counts_sum, curr_title from clickstream_derived where link_type_derived=%s group by curr_id order by counts_sum desc limit 10000;',
            ("entry-sm", ))
        result = cursor.fetchall()
        for row in result:
            record = {}
            record['curr_id'] = row[0]
            record['counts_sum'] = row[1]
            record['curr_title'] = row[2]
            sm.append(row[0])
    except MySQLdb.Error, e:
        print e
Example #36
def export_data_unresolved():

    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_work_view = db.get_work_view()
    connection = db_work_view._db_connection


    df_clickstream = pn.read_csv('/home/ddimitrov/data/enwiki201608_unresolved_redirects/2016_08_clickstream_unresolved.tsv', sep='\t', error_bad_lines=False)

    df_clickstream['prev']=df_clickstream['prev'].str.replace('_', ' ')
    df_clickstream['curr']=df_clickstream['curr'].str.replace('_', ' ')
    df_clickstream['curr_unresolved']=df_clickstream['curr_unresolved'].str.replace('_', ' ')


    df_redirects_candidates = pn.read_sql('select * from redirects_candidates_sample', connection)


    sample_unresoleved = pn.merge(df_redirects_candidates, df_clickstream, how='left', left_on= ['source_article_name','target_article_name'], right_on=['prev', 'curr_unresolved'])

    sample_unresoleved['n'].fillna(0, inplace=True)
    sample_unresoleved.to_csv('/home/ddimitrov/data/enwiki201608_unresolved_redirects/data_unresolved.tsv', sep='\t',encoding="utf-8")
Example #37
def req():
    # Get URLs from a text file, remove white space.
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                       DATABASE_NAME)
    db_worker_view = db.get_work_view()
    articles = db_worker_view.retrieve_all_articles()
    #articles = db_worker_view.retrieve_all_articles_questionmark()
    # measure time
    start = time.clock()
    start_time_iteration = start
    iteration_number = 483
    for i, article in enumerate(articles):
        # print some progress
        if i % 10000 == 0:
            #print time for the iteration
            seconds = time.clock() - start_time_iteration
            m, s = divmod(seconds, 60)
            h, m = divmod(m, 60)
            print "Number of crawled articles: %d. Total time for last iteration of 10000 articles: %d:%02d:%02d" % (
                i, h, m, s)
            start_time_iteration = time.clock()
            iteration_number += 1

        # Thread pool.
        # Blocks other threads (more than the set limit).
        pool.acquire(blocking=True)
        # Create a new thread.
        # Pass each URL (i.e. u parameter) to the worker function.
        t = threading.Thread(
            target=worker,
            args=(MEDIAWIKI_API_ENDPOINT + urllib.quote(article['title']) +
                  '/' + str(article['rev_id']), article, iteration_number))

        # Start the newly created thread.
        t.start()
    seconds = time.clock() - start
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    print "Total time: %d:%02d:%02d" % (h, m, s)
Example #38
def correlations_zeros(labels, consider_zeros=True, clickstream_data='', struct=False):
    #load network
    print struct
    name = '_'.join(labels)
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering_"+name+".xml.gz")
    #read counts with zeros
    if consider_zeros:
        article_counts = pd.read_csv(TMP+clickstream_data+'article_counts.tsv', sep='\t')
        print TMP+clickstream_data+'article_counts.tsv'
        correlations_weighted_pagerank = {}
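        # correlate each weighted pagerank variant (per label and damping factor) with the article click counts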
        for label in labels:
            if struct:
                label = label[7:]
            for damping in [0.8,0.85,0.9]:
                key = label+"_page_rank_weighted_"+str(damping)
                pagerank = wikipedia.vertex_properties[key]
                page_rank_values = list()
                counts = list()
                correlations_values = {}
                for index, row in article_counts.iterrows():
                    counts.append(float(row['counts']))
                    page_rank_values.append(pagerank[wikipedia.vertex(int(row['target_article_id']))])
                print 'pearson'
                p = pearsonr(page_rank_values, counts)
                print p
                correlations_values['pearson']=p
                print 'spearmanr'
                s = spearmanr(page_rank_values, counts)
                print s
                correlations_values['spearmanr']=s
                print 'kendalltau'
                k = kendalltau(page_rank_values, counts)
                print k
                correlations_values['kendalltau']=k
                correlations_weighted_pagerank[key]=correlations_values

        write_pickle(HOME+'output/correlations/'+clickstream_data+'correlations_pagerank_'+name+'.obj', correlations_weighted_pagerank)
    else:
        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
        conn = db._create_connection()
        cursor = conn.cursor()
        # wikipedia  graph  structural statistics

        results = None
        try:
            if clickstream_data != '':

                results = cursor.execute('select c.curr_id,  sum(c.counts) as counts from clickstream_derived c where c.link_type_derived= %s  group by c.curr_id;', ("internal-link",))
                results = cursor.fetchall()
            else:
                results = cursor.execute('select c.curr_id,  sum(c.counts) as counts from clickstream_derived_en_201501 c where c.link_type_derived= %s  group by c.curr_id;', ("internal-link",))
                results = cursor.fetchall()

        except MySQLdb.Error, e:
            print ('error retrieving xy coord for all links %s (%d)' % (e.args[1], e.args[0]))
        print 'after sql load'


        correlations_weighted_pagerank = {}
        for label in labels:
            if struct:
                label = label[7:]
            for damping in [0.8,0.85,0.9]:
                key = label+"_page_rank_weighted_"+str(damping)
                pagerank = wikipedia.vertex_properties[key]
                correlations={}
                counts=[]
                page_rank_values=[]
                for row in results:
                    counts.append(float(row[1]))
                    page_rank_values.append(pagerank[wikipedia.vertex(int(row[0]))])
                print 'pearson'
                p = pearsonr(page_rank_values, counts)
                print p
                correlations['pearson']=p
                print 'spearmanr'
                s= spearmanr(page_rank_values, counts)
                print s
                correlations['spearmanr']=s
                print 'kendalltau'
                k= kendalltau(page_rank_values, counts)
                print k
                correlations['kendalltau']=k
                correlations_weighted_pagerank[key]=correlations



        write_pickle(HOME+'output/correlations/'+clickstream_data+'correlations_pagerank_without_zeros'+name+'.obj', correlations_weighted_pagerank)
Example #39
def print_table():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    df = pd.read_sql('select source_article_id, target_article_id, rel_degree, rel_in_degree, rel_out_degree, '
                     'rel_page_rank, rel_kcore, target_x_coord_1920_1080, target_y_coord_1920_1080, visual_region, '
                     'IFNULL(counts, 0) as counts from link_features order by source_article_id, target_y_coord_1920_1080, target_x_coord_1920_1080', conn)

    print "dup"
    #no_dup = df.sort(['source_article_id','target_y_coord_1920_1080','target_x_coord_1920_1080']).groupby(["source_article_id", "target_article_id"]).first()
    no_dup = df.groupby(["source_article_id", "target_article_id"]).first()

    no_dup = no_dup.reset_index()
    print "no dup"
    del df
    #print no_dup
    df_top = pd.read_sql("select source_article_id, target_article_id, sim as topic_similarity  from topic_similarity", conn)
    print "no up"
    topDF = df_top.groupby("source_article_id", as_index=False)["topic_similarity"].median()
    #print topDF
    print "no up1"
    topDF.columns = ["source_article_id", "topic_similarity_article_median"]
    #print topDF
    print "no up2"
    df_top = df_top.merge(topDF, on="source_article_id")
    #print df_top[(df_top['topic_similarity_article_median'] >0)]
    print "no up3"

    df_sem = pd.read_sql("select source_article_id, target_article_id, sim as sem_similarity from semantic_similarity", conn)
    print "no up4"
    semDF = df_sem.groupby("source_article_id", as_index=False)["sem_similarity"].median()
    #rename
    print "no up5"
    semDF.columns = ["source_article_id", "sem_similarity_article_median"]
    print "no up6"
    #print df_top
    df_sem = df_sem.merge(semDF, on="source_article_id")
    #print len(df_sem)
    print "no up7"
    df1 = no_dup.merge(df_sem[['source_article_id', 'sem_similarity', 'sem_similarity_article_median']], on="source_article_id")
    #print no_dup
    del df_sem, semDF
    df = no_dup.merge(df_top[['source_article_id', 'topic_similarity', 'topic_similarity_article_median']], on="source_article_id")
    print "no up9"
    del no_dup
    del df_top, topDF

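    # append one result-table row per link-feature condition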
    table = ""

    table += resultTableLine (df, "src_degr > target_degr", "df.rel_degree > 0")
    table += resultTableLine (df, "src_degr <= target_degr", "df.rel_degree <= 0")


    table += resultTableLine (df, "src_in_degr > target_in_degr", "df.rel_in_degree > 0")
    table += resultTableLine (df, "src_in_degr <= target_in_degr", "df.rel_in_degree <= 0")


    table += resultTableLine (df, "src_out_degr > target_out_degr", "df.rel_out_degree > 0")
    table += resultTableLine (df, "src_out_degr <= target_out_degr", "df.rel_out_degree <= 0")

    table += resultTableLine (df, "src_kcore > target_kcore", "df.rel_kcore > 0")
    table += resultTableLine (df, "src_kcore <= target_kcore", "df.rel_kcore <= 0")

    table += resultTableLine (df, "src_page_rank > target_page_rank", "df.rel_page_rank > 0")
    table += resultTableLine (df, "src_page_rank <= target_page_rank", "df.rel_page_rank <= 0")


    table += resultTableLine (df1, "text_sim > median(text_sim) of page", "df.sem_similarity > df.sem_similarity_article_median")
    table += resultTableLine (df1, "text_sim <= median(text_sim) of page", "df.sem_similarity <= df.sem_similarity_article_median")

    table += resultTableLine (df, "topic_sim > median(topic_sim) of page", "df.topic_similarity > df.topic_similarity_article_median")
    table += resultTableLine (df, "topic_sim <= median(topic_sim) of page", "df.topic_similarity <= df.topic_similarity_article_median")


    table += resultTableLine (df, "left third of screen", "df.target_x_coord_1920_1080 <= 360")
    table += resultTableLine (df, "middle third of screen", "(df.target_x_coord_1920_1080 > 360) & (df.target_x_coord_1920_1080 <= 720)")
    table += resultTableLine (df, "right third of screen", "df.target_x_coord_1920_1080 > 720")

    table += resultTableLine (df, "position = lead", "df.visual_region == 'lead'")
    table += resultTableLine (df, "position = body", "(df.visual_region == 'body') | (df.visual_region == 'left-body')")
    table += resultTableLine (df, "position = navbox", "df.visual_region == 'navbox'")
    #table += resultTableLine (df, "position = left-body", "df.visual_region == 'left-body'")
    table += resultTableLine (df, "position = infobox", "df.visual_region == 'infobox'")


    print table
Example #40
from wsd.database import MySQLDatabase
from graph_tool.all import *
from conf import *

db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                   DATABASE_NAME)
conn = db._create_connection()
cursor = conn.cursor()
cursor.execute(
    'SELECT source_article_id, target_article_id FROM link_occurences;')
result = cursor.fetchall()
wikipedia = Graph()

for link in result:
    wikipedia.add_edge(link[0], link[1])

# filter all nodes that have no edges
wikipedia = GraphView(wikipedia,
                      vfilt=lambda v: v.out_degree() + v.in_degree() > 0)

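# compute structural statistics and store them as vertex properties of the graph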
print "clust"
wikipedia.vertex_properties["local_clust"] = local_clustering(wikipedia)

print "page_rank"
wikipedia.vertex_properties["page_rank"] = pagerank(wikipedia)

print "eigenvector_centr"
eigenvalue, eigenvectorcentr = eigenvector(wikipedia)
wikipedia.vertex_properties["eigenvector_centr"] = eigenvectorcentr

print "kcore"
Example #41
from wsd.database import MySQLDatabase
from graph_tool.all import *
from conf import *


db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
conn = db._create_connection()
cursor = conn.cursor()
cursor.execute('SELECT source_article_id, target_article_id FROM link_occurences;')
result = cursor.fetchall()
wikipedia = Graph()

for link in result:
    wikipedia.add_edge(link[0], link[1])

# filter all nodes that have no edges
wikipedia = GraphView(wikipedia, vfilt=lambda v : v.out_degree()+v.in_degree()>0 )

print "clust"
wikipedia.vertex_properties["local_clust"] = local_clustering(wikipedia)

print "page_rank"
wikipedia.vertex_properties["page_rank"] = pagerank(wikipedia)

print "eigenvector_centr"
eigenvalue, eigenvectorcentr = eigenvector(wikipedia)
wikipedia.vertex_properties["eigenvector_centr"] = eigenvectorcentr

print "kcore"
wikipedia.vertex_properties["kcore"] = kcore_decomposition(wikipedia)
Example #42
def links_heatmap_rel_prob():
    #http://stackoverflow.com/questions/2369492/generate-a-heatmap-in-matplotlib-using-a-scatter-data-set
    # Get URLs from a text file, remove white space.
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    coords = db_worker_view.retrieve_all_links_coords()

    x=[]
    y=[]


    page_lenghts = db_worker_view.retrieve_all_page_lengths()

    for coord in coords:
        x_normed = float(coord['x'])/float(1920)
        y_normed = float(coord['y'])/float(page_lenghts[coord['source_article_id']])
        if  x_normed <=1.0 and y_normed <=1.0:
            x.append(x_normed)
            y.append(y_normed)



    links_heatmap_hist, xedges, yedges = np.histogram2d(x, y, normed=True,  bins=100)
    links_extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]




    coords = db_worker_view.retrieve_all_links_coords_clicks()
    print 'coord loaded'
    links = {}
    x = []
    y = []
    values = []
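    # count duplicate occurrences per link key: the stored value is (occurrences - 1), so 0 means the link occurs exactly once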
    for coord in coords:
        try:
            v = links[coord['key']]
            links[coord['key']] += 1
        except:
            links[coord['key']] = 0
    for coord in coords:
        x_normed = float(coord['x'])/float(1920)
        y_normed = float(coord['y'])/float(coord['page_length'])
        if x_normed <= 1.0 and y_normed <= 1.0:
            x.append(x_normed)
            y.append(y_normed)
            if links[coord['key']] == 0:
                #x.append(x_normed)
                #y.append(y_normed)
                values.append(float(coord['counts']))
            else:
                values.append(float(coord['counts'])/float(links[coord['key']]))

    clicks_heatmap_hist, xedges, yedges = np.histogram2d(x, y, bins=100, normed=True, weights=values)
    clicks_extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]

    substraction_hist = np.subtract(clicks_heatmap_hist,links_heatmap_hist)
    #rel_prob_hist = np.divide(clicks_heatmap_hist, links_heatmap_hist)
    with np.errstate(divide='ignore', invalid='ignore'):
        rel_prob_hist = np.divide(clicks_heatmap_hist, links_heatmap_hist)
        rel_prob_hist[rel_prob_hist == np.inf] = 0
        rel_prob_hist = np.nan_to_num(rel_prob_hist)

    fig_size = (2.4, 2)

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)

    plt.imshow(substraction_hist, extent=clicks_extent, origin='upper',norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()


    plt.show()
    plt.savefig('output/clicks-links_heatmap_normed_self_loop.pdf')


    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)

    plt.imshow(rel_prob_hist , extent=clicks_extent, origin='upper', norm=Normalize(),cmap=plt.get_cmap('jet'))
    plt.colorbar()


    plt.show()
    plt.savefig('output/clicks_over_links_heatmap_normed_self_loop.pdf')


    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)

    plt.imshow(substraction_hist, extent=clicks_extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()


    plt.show()
    plt.savefig('output/clicks-links_heatmap_lognormed_self_loop.pdf')


    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)

    plt.imshow(rel_prob_hist , extent=clicks_extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()


    plt.show()
    plt.savefig('output/clicks_over_links_heatmap_lognormed_self_loop.pdf')


    substraction_hist = np.subtract(links_heatmap_hist, clicks_heatmap_hist)
    #rel_prob_hist = np.divide(clicks_heatmap_hist, links_heatmap_hist)
    with np.errstate(divide='ignore', invalid='ignore'):
        rel_prob_hist = np.divide(links_heatmap_hist, clicks_heatmap_hist)
        rel_prob_hist[rel_prob_hist == np.inf] = 0
        rel_prob_hist = np.nan_to_num(rel_prob_hist)



    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)

    plt.imshow(substraction_hist, extent=clicks_extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Normalized")

    plt.show()
    plt.savefig('output/links-clicks_heatmap_normed_self_loop.pdf')


    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)

    plt.imshow(rel_prob_hist , extent=clicks_extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Normalized")

    plt.show()
    plt.savefig('output/links_over_clicks_heatmap_normed_self_loop.pdf')

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)

    plt.imshow(substraction_hist, extent=clicks_extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Normalized")

    plt.show()
    plt.savefig('output/links-clicks_heatmap_lognormed_self_loop.pdf')


    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)

    plt.imshow(rel_prob_hist , extent=clicks_extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Normalized")

    plt.show()
    plt.savefig('output/links_over_clicks_heatmap_lognormed_self_loop.pdf')
    print "done"
Example #43
def multiple_links_heatmap():
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    coords = db_worker_view.retrieve_all_links_multpile_occ()
    print 'coord loaded'
    page_lenghts = db_worker_view.retrieve_all_page_lengths()
    print 'lengths loaded'
    links = {}
    x = []
    y = []
    x_conf = []
    y_conf = []
    x_not_conf = []
    y_not_conf = []
    number_of_not_confident_clicks=0
    number_of_confident_clicks = 0
    number_of_valid_normed_links=0
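    # count duplicate occurrences per link key: 0 means the link appears exactly once on the page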
    for coord in coords:
        try:
            v = links[coord['key']]
            links[coord['key']]+=1
        except:
            links[coord['key']]=0
    for coord in coords:
        x_normed = float(coord['x'])/float(1920)
        y_normed = float(coord['y'])/float(page_lenghts[coord['key'][0]])
        if  x_normed <=1.0 and y_normed <=1.0:
            x.append(x_normed)
            y.append(y_normed)
            number_of_valid_normed_links+=1
            if links[coord['key']]==0:
                x_conf.append(x_normed)
                y_conf.append(y_normed)
                number_of_confident_clicks+=1
            else:
                x_not_conf.append(x_normed)
                y_not_conf.append(y_normed)
                number_of_not_confident_clicks+=1
    print '###########'
    print number_of_confident_clicks
    print number_of_not_confident_clicks
    print number_of_valid_normed_links
    print len(coords)
    print '###########'



    heatmap, xedges, yedges = np.histogram2d(x_conf, y_conf, bins=100)
    extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]

    fig_size = (2.4, 2)
    #fig_size = (3.5, 3)
    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)

    plt.imshow(heatmap, extent=extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Log Normalized")

    plt.savefig('output/links_heatmap_lognormed_self_loop_unique.pdf')
    plt.show()

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)

    plt.imshow(heatmap , extent=extent, origin='upper', norm=Normalize(),cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Normalized")

    plt.savefig('output/links_heatmap_normed_self_loop_unique.pdf')
    plt.show()

    print "unique done"

    heatmap, xedges, yedges = np.histogram2d(x_not_conf, y_not_conf, bins=100)
    extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]

    fig_size = (2.4, 2)
    #fig_size = (3.5, 3)
    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)

    plt.imshow(heatmap, extent=extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Log Normalized")

    plt.savefig('output/links_heatmap_lognormed_self_loop_multiple.pdf')
    plt.show()

    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)

    plt.imshow(heatmap , extent=extent, origin='upper', norm=Normalize(),cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Normalized")

    plt.savefig('output/links_heatmap_normed_self_loop_multiple.pdf')
    plt.show()

    print "done"
Example #44
0
from wsd.database import MySQLDatabase
from graph_tool.all import *
from conf import *
__author__ = 'dimitrovdr'

db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                   DATABASE_NAME)
db_work_view = db.get_work_view()

wikipedia = Graph()

for link in db_work_view.retrieve_all_internal_transitions_counts():
    for i in range(int(link['counts'])):
        wikipedia.add_edge(link['from'], link['to'])

    #print 'from %s, to %s', link['from'], link['to']

#wikipedia.save("output/transitionsnetwork.xml.gz")

# filter all nodes that have no edges
transitions_network = GraphView(
    wikipedia, vfilt=lambda v: v.out_degree() + v.in_degree() > 0)

transitions_network.save("output/transitionsnetworkweighted.xml.gz")

print "Stats for transitions network:"
print "number of nodes: %d" % transitions_network.num_vertices()
print "number of edges: %d" % transitions_network.num_edges()
Example #45
0
def print_table():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                       DATABASE_NAME)
    conn = db._create_connection()

    df = pd.read_sql(
        'select source_article_id, target_article_id, rel_degree, rel_in_degree, rel_out_degree, '
        'rel_page_rank, rel_kcore, target_x_coord_1920_1080, target_y_coord_1920_1080, visual_region, '
        'IFNULL(counts, 0) as counts from link_features order by source_article_id, target_y_coord_1920_1080, target_x_coord_1920_1080',
        conn)

    print "dup"
    #no_dup = df.sort(['source_article_id','target_y_coord_1920_1080','target_x_coord_1920_1080']).groupby(["source_article_id", "target_article_id"]).first()
    no_dup = df.groupby(["source_article_id", "target_article_id"]).first()

    no_dup = no_dup.reset_index()
    print "no dup"
    del df
    #print no_dup
    df_top = pd.read_sql(
        "select source_article_id, target_article_id, sim as topic_similarity  from topic_similarity",
        conn)
    print "no up"
    topDF = df_top.groupby("source_article_id",
                           as_index=False)["topic_similarity"].median()
    #print topDF
    print "no up1"
    topDF.columns = ["source_article_id", "topic_similarity_article_median"]
    #print topDF
    print "no up2"
    df_top = df_top.merge(topDF, on="source_article_id")
    #print df_top[(df_top['topic_similarity_article_median'] >0)]
    print "no up3"

    df_sem = pd.read_sql(
        "select source_article_id, target_article_id, sim as sem_similarity from semantic_similarity",
        conn)
    print "no up4"
    semDF = df_sem.groupby("source_article_id",
                           as_index=False)["sem_similarity"].median()
    #rename
    print "no up5"
    semDF.columns = ["source_article_id", "sem_similarity_article_median"]
    print "no up6"
    #print df_top
    df_sem = df_sem.merge(semDF, on="source_article_id")
    #print len(df_sem)
    print "no up7"
    df1 = no_dup.merge(df_sem[[
        'source_article_id', 'sem_similarity', 'sem_similarity_article_median'
    ]],
                       on="source_article_id")
    #print no_dup
    del df_sem, semDF
    df = no_dup.merge(df_top[[
        'source_article_id', 'topic_similarity',
        'topic_similarity_article_median'
    ]],
                      on="source_article_id")
    print "no up9"
    del no_dup
    del df_top, topDF

    table = ""

    table += resultTableLine(df, "src_degr > target_degr", "df.rel_degree > 0")
    table += resultTableLine(df, "src_degr <= target_degr",
                             "df.rel_degree <= 0")

    table += resultTableLine(df, "src_in_degr > target_in_degr",
                             "df.rel_in_degree > 0")
    table += resultTableLine(df, "src_in_degr <= target_in_degr",
                             "df.rel_in_degree <= 0")

    table += resultTableLine(df, "src_out_degr > target_out_degr",
                             "df.rel_out_degree > 0")
    table += resultTableLine(df, "src_out_degr <= target_out_degr",
                             "df.rel_out_degree <= 0")

    table += resultTableLine(df, "src_kcore > target_kcore",
                             "df.rel_kcore > 0")
    table += resultTableLine(df, "src_kcore <= target_kcore",
                             "df.rel_kcore <= 0")

    table += resultTableLine(df, "src_page_rank > target_page_rank",
                             "df.rel_page_rank > 0")
    table += resultTableLine(df, "src_page_rank <= target_page_rank",
                             "df.rel_page_rank <= 0")

    table += resultTableLine(
        df1, "text_sim > median(text_sim) of page",
        "df.sem_similarity > df.sem_similarity_article_median")
    table += resultTableLine(
        df1, "text_sim <= median(text_sim) of page",
        "df.sem_similarity <= df.sem_similarity_article_median")

    table += resultTableLine(
        df, "topic_sim > median(topic_sim) of page",
        "df.topic_similarity > df.topic_similarity_article_median")
    table += resultTableLine(
        df, "topic_sim <= median(topic_sim) of page",
        "df.topic_similarity <= df.topic_similarity_article_median")

    table += resultTableLine(df, "left third of screen",
                             "df.target_x_coord_1920_1080 <= 360")
    table += resultTableLine(
        df, "middle third of screen",
        "(df.target_x_coord_1920_1080 > 360) & (df.target_x_coord_1920_1080 <= 720)"
    )
    table += resultTableLine(df, "right third of screen",
                             "df.target_x_coord_1920_1080 > 720")

    table += resultTableLine(df, "position = lead",
                             "df.visual_region == 'lead'")
    table += resultTableLine(
        df, "position = body",
        "(df.visual_region == 'body') | (df.visual_region == 'left-body')")
    table += resultTableLine(df, "position = navbox",
                             "df.visual_region == 'navbox'")
    #table += resultTableLine (df, "position = left-body", "df.visual_region == 'left-body'")
    table += resultTableLine(df, "position = infobox",
                             "df.visual_region == 'infobox'")

    print table
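
resultTableLine is not defined in this snippet. The sketch below is an assumption about what it does, not the original implementation: evaluate the boolean condition string against the passed frame and report which share of links and of clicks falls into that subset.

def resultTableLine(df, label, condition):
    # hypothetical sketch: 'condition' is a boolean expression over df,
    # e.g. "df.rel_degree > 0"
    subset = df[eval(condition)]
    link_share = float(len(subset)) / len(df)
    click_share = float(subset['counts'].sum()) / df['counts'].sum()
    return "%s\t%.2f%%\t%.2f%%\n" % (label, 100 * link_share, 100 * click_share)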
Example #46
0
def weighted_pagerank_hyp_engineering(labels):

    #read vocab, graph
    graph =  read_pickle(SSD_HOME+"pickle/graph")
    print "loaded graph"
    values =  read_pickle(SSD_HOME+"pickle/values")
    values_kcore = read_pickle(SSD_HOME+"pickle/values_kcore")

    # transform kcore values to model going out of the kcore
    values_kcore = [1./np.sqrt(float(x)) for x in values_kcore]
    print 'kcore values transformation'

    #sem_sim_hyp = read_pickle(SSD_HOME+"pickle/sem_sim_hyp")
    #print "sem_sim_hyp values"

    #lead_hyp = read_pickle(SSD_HOME+"pickle/lead_hyp")
    #infobox_hyp = read_pickle(SSD_HOME+"pickle/infobox_hyp")
    #left_body_hyp = read_pickle(SSD_HOME+"pickle/left-body_hyp")
    #print "gamma values"

    vocab = read_pickle(SSD_HOME+"pickle/vocab")
    print "loaded vocab"

    state_count = len(vocab)
    states = vocab.keys()
    shape = (state_count, state_count)


    hyp_structural = csr_matrix((values, (graph[0], graph[1])),
                                shape=shape, dtype=np.float)


    hyp_kcore = csr_matrix((values_kcore, (graph[0], graph[1])),
                           shape=shape, dtype=np.float)
    print "hyp_kcore"

    del graph
    del values_kcore

    print "after delete"


    #read sem sim form db and create hyp
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id, sim from semantic_similarity', conn)
    print 'map sem sim'
    sem_sim_hyp_i = map_to_hyp_indicies(vocab, df['source_article_id'])
    sem_sim_hyp_j = map_to_hyp_indicies(vocab, df['target_article_id'])

    hyp_sem_sim = csr_matrix((df['sim'].values, (sem_sim_hyp_i, sem_sim_hyp_j)),
                             shape=shape, dtype=np.float)
    print 'done map sem sim'
    print hyp_sem_sim.shape
    del sem_sim_hyp_i
    del sem_sim_hyp_j
    del df

    #read vis form csv and create hyp
    lead = pd.read_csv(TMP+'lead.tsv',sep='\t')
    lead_i = map_to_hyp_indicies(vocab, lead['source_article_id'])
    lead_j = map_to_hyp_indicies(vocab, lead['target_article_id'])
    lead_v = np.ones(len(lead_i), dtype=np.float)
    
    hyp_lead = csr_matrix((lead_v, (lead_i, lead_j)),
                            shape=shape, dtype=np.float)
    print 'done map lead'
    print hyp_lead.shape
    del lead
    del lead_i
    del lead_j
    del lead_v

    infobox = pd.read_csv(TMP+'infobox.tsv',sep='\t')
    infobox_i = map_to_hyp_indicies(vocab, infobox['source_article_id'])
    infobox_j = map_to_hyp_indicies(vocab, infobox['target_article_id'])
    infobox_v = np.ones(len(infobox_i), dtype=np.float)

    hyp_infobox = csr_matrix((infobox_v, (infobox_i, infobox_j)),
                             shape=shape, dtype=np.float)
    print 'done map infobox'
    print hyp_infobox.shape
    del infobox
    del infobox_i
    del infobox_j
    del infobox_v

    left_body = pd.read_csv(TMP+'left-body.tsv',sep='\t')
    left_body_i = map_to_hyp_indicies(vocab, left_body['source_article_id'])
    left_body_j = map_to_hyp_indicies(vocab, left_body['target_article_id'])
    left_body_v = np.ones(len(left_body_i), dtype=np.float)

    hyp_left_body = csr_matrix((left_body_v, (left_body_i, left_body_j)),
                               shape=shape, dtype=np.float)
    print 'done map left-body'
    print hyp_left_body.shape
    del left_body
    del left_body_i
    del left_body_j
    del left_body_v

    #add the visual hyps to one matrix and set all non zero fields to 1.0
    print 'before gamma'
    hyp_gamma = hyp_left_body + hyp_infobox + hyp_lead
    hyp_gamma.data = np.ones_like(hyp_gamma.data, dtype=np.float)
    print 'after gamma'

    del hyp_left_body
    del hyp_infobox
    del hyp_lead

    #norm
    print "in norm each "
    hyp_structural = norm(hyp_structural)
    hyp_kcore = norm(hyp_kcore)
    hyp_sem_sim = norm(hyp_sem_sim)
    hyp_gamma = norm(hyp_gamma)

    #engineering of hypos and norm again
    hyp_kcore_struct = norm(hyp_structural + hyp_kcore)
    hyp_visual_struct = norm(hyp_structural + hyp_gamma)
    hyp_sem_sim_struct = norm(hyp_structural + hyp_sem_sim)

    hyp_mix_semsim_kcore = norm(hyp_kcore + hyp_sem_sim)
    hyp_mix_semsim_visual = norm(hyp_sem_sim + hyp_gamma)
    hyp_mix_kcore_visual= norm(hyp_kcore + hyp_gamma)


    hyp_all = norm(hyp_kcore + hyp_sem_sim + hyp_gamma)
    hyp_all_struct =  norm(hyp_kcore + hyp_sem_sim + hyp_gamma + hyp_structural)

    hyp_semsim_struct = norm(hyp_structural + hyp_kcore)

    print 'test hypos'


    hypos={}
    hypos['hyp_kcore']=hyp_kcore
    hypos['hyp_sem_sim']=hyp_sem_sim
    hypos['hyp_visual']=hyp_gamma

    hypos['hyp_kcore_struct']=hyp_kcore_struct
    hypos['hyp_visual_struct']=hyp_visual_struct
    hypos['hyp_sem_sim_struct']=hyp_sem_sim_struct

    hypos['hyp_mix_semsim_kcore']=hyp_mix_semsim_kcore
    hypos['hyp_mix_semsim_visual']=hyp_mix_semsim_visual
    hypos['hyp_mix_kcore_visual']=hyp_mix_kcore_visual

    hypos['hyp_all']=hyp_all
    hypos['hyp_all_struct']=hyp_all_struct



    #load network
    print "weighted page rank engineering"
    wikipedia = load_graph("output/wikipedianetwork.xml.gz")

    #for label, hyp in hypos.iteritems():
    name = '_'.join(labels)
    for label in labels:
        print label
        eprop = create_eprop(wikipedia,  hypos[label], vocab)
        wikipedia.edge_properties[label]=eprop
        #for damping in [0.8, 0.85, 0.9 ,0.95]:
        for damping in [0.85]:
            key = label+"_page_rank_weighted_"+str(damping)
            print key
            wikipedia.vertex_properties[key] = pagerank(wikipedia, weight=eprop, damping=damping)
        print 'save network'

        wikipedia.save("output/weightedpagerank/wikipedianetwork_hyp_engineering_"+name+".xml.gz")

    print 'save network'
    wikipedia.save("output/weightedpagerank/wikipedianetwork_hyp_engineering_"+name+".xml.gz")
    print 'done'
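
norm, map_to_hyp_indicies and create_eprop are project helpers that are not shown in this snippet. The sketches below are assumptions about their behaviour, not the original implementations (the sklearn dependency, the l1 row normalisation and the article-id-to-index convention are guesses):

from sklearn.preprocessing import normalize

def map_to_hyp_indicies(vocab, ids):
    # assumed: vocab maps an article id to its row/column index in the hypothesis matrices
    return [vocab[i] for i in ids]

def norm(hyp):
    # assumed: row-normalise the sparse hypothesis matrix so each row sums to 1
    return normalize(hyp, norm='l1', axis=1)

def create_eprop(graph, hyp, vocab):
    # assumed: copy the hypothesis weight of every existing edge into a
    # graph_tool edge property so it can be used as a pagerank weight
    eprop = graph.new_edge_property("double")
    for e in graph.edges():
        eprop[e] = hyp[vocab[int(e.source())], vocab[int(e.target())]]
    return eprop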
Example #47
0
from wsd.database import MySQLDatabase
from graph_tool.all import *
from conf import *
__author__ = 'dimitrovdr'


db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
db_work_view = db.get_work_view()

wikipedia = Graph()

for link in db_work_view.retrieve_all_internal_transitions():
    wikipedia.add_edge(link['from'], link['to'])
    #print 'from %s, to %s', link['from'], link['to']



#wikipedia.save("output/transitionsnetwork.xml.gz")

# filter all nodes that have no edges
transitions_network = GraphView(wikipedia, vfilt=lambda v : v.out_degree()+v.in_degree()>0 )

print "clust"
transitions_network.vertex_properties["local_clust"] = local_clustering(transitions_network)

print "page_rank"
transitions_network.vertex_properties["page_rank"] = pagerank(transitions_network)

print "eigenvector_centr"
eigenvalue, eigenvectorcentr = eigenvector(transitions_network)
transitions_network.vertex_properties["eigenvector_centr"] = eigenvectorcentr
Example #48
0
def weighted_pagerank():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    cursor.execute('SELECT source_article_id, target_article_id, occ FROM link_occurences;')
    result = cursor.fetchall()
    wikipedia = Graph()
    eprop = wikipedia.new_edge_property("int")

    for link in result:
        e = wikipedia.add_edge(link[0], link[1])
        eprop[e] = link[2]
    # filter all nodes that have no edges
    wikipedia = GraphView(wikipedia, vfilt=lambda v : v.out_degree()+v.in_degree()>0 )


    print "page_rank_weighted"
    for damping in [0.8, 0.85, 0.9 ,0.95]:
        print damping
        key = "page_rank_weighted"+str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, weight=eprop,damping=damping)

    print "page_rank"
    for damping in [0.8, 0.85, 0.9 ,0.95]:
        print damping
        key = "page_rank"+str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, damping=damping)




    wikipedia.save("output/weightedpagerank/wikipedianetwork_link_occ.xml.gz")
    print 'link_occ done'


    cursor.execute('SELECT source_article_id, target_article_id, sim FROM semantic_similarity group by '
                   'source_article_id, target_article_id;')
    result = cursor.fetchall()
    wikipedia = Graph()
    eprop = wikipedia.new_edge_property("double")

    for link in result:
        e = wikipedia.add_edge(link[0], link[1])
        eprop[e] = link[2]
    # filter all nodes that have no edges
    print 'filter nodes graph tool specific code'
    wikipedia = GraphView(wikipedia, vfilt=lambda v : v.out_degree()+v.in_degree()>0 )


    print "page_rank_weighted"
    for damping in [0.8, 0.85, 0.9 ,0.95]:
        print damping
        key = "page_rank_weighted"+str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, weight=eprop,damping=damping)

    print "page_rank"
    for damping in [0.8, 0.85, 0.9 ,0.95]:
        print damping
        key = "page_rank"+str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, damping=damping)


    wikipedia.save("output/weightedpagerank/wikipedianetwork_sem_sim_distinct_links.xml.gz")
    print 'sem sim distinct links done'

    cursor.execute('SELECT source_article_id, target_article_id, sim FROM semantic_similarity;')
    result = cursor.fetchall()
    wikipedia = Graph()
    eprop = wikipedia.new_edge_property("double")

    for link in result:
        e = wikipedia.add_edge(link[0], link[1])
        eprop[e] = link[2]
    # filter all nodes that have no edges
    wikipedia = GraphView(wikipedia, vfilt=lambda v : v.out_degree()+v.in_degree()>0 )


    print "page_rank_weighted"
    for damping in [0.8, 0.85, 0.9 ,0.95]:
        print damping
        key = "page_rank_weighted"+str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, weight=eprop,damping=damping)

    print "page_rank"
    for damping in [0.8, 0.85, 0.9 ,0.95]:
        print damping
        key = "page_rank"+str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, damping=damping)

    wikipedia.save("output/weightedpagerank/wikipedianetwork_sem_sim.xml.gz")
    print 'sem_sim done'
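
The build/rank/save sequence above is repeated three times with only the query, the edge-property type and the output file changing. A possible refactor (hypothetical helper, not part of the original code) factors it out:

from graph_tool.all import *

def pagerank_with_weights(rows, prop_type, outfile):
    # rows: iterable of (source_article_id, target_article_id, weight) tuples
    g = Graph()
    eprop = g.new_edge_property(prop_type)
    for source, target, weight in rows:
        e = g.add_edge(source, target)
        eprop[e] = weight
    # keep only vertices that participate in at least one edge
    g = GraphView(g, vfilt=lambda v: v.out_degree() + v.in_degree() > 0)
    for damping in [0.8, 0.85, 0.9, 0.95]:
        g.vertex_properties["page_rank_weighted" + str(damping)] = pagerank(g, weight=eprop, damping=damping)
        g.vertex_properties["page_rank" + str(damping)] = pagerank(g, damping=damping)
    g.save(outfile)

Calling pagerank_with_weights(result, "double", "output/weightedpagerank/wikipedianetwork_sem_sim.xml.gz") would reproduce the last block above.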