def build_links_position_table():
    """creates up the basic database structure
    """
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()

    cursor.execute('CREATE TABLE `redirects_candidates` ('
                      '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,'
                      '`source_article_id` BIGINT UNSIGNED NOT NULL,'
                      '`target_article_id` BIGINT UNSIGNED NULL,'
                      '`target_article_name` VARCHAR(1000) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
                      ' target_position_in_text INT UNSIGNED NOT NULL,'
                      ' target_position_in_text_only INT UNSIGNED,'
                      ' target_position_in_section INT UNSIGNED,'
                      ' target_position_in_section_in_text_only INT UNSIGNED,'
                      ' section_name VARCHAR(1000) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
                      ' section_number INT UNSIGNED,'
                      ' target_position_in_table INT UNSIGNED,'
                      ' table_number INT UNSIGNED,'
                      ' table_css_class VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
                      ' table_css_style VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
                      ' target_x_coord_1920_1080 INT UNSIGNED DEFAULT NULL,'
                      ' target_y_coord_1920_1080 INT UNSIGNED DEFAULT NULL ,'
                      'INDEX(`target_article_id`),'
                      'INDEX(`source_article_id`)'
                  ') ENGINE=InnoDB;')
    connection.close()
Example #2
def pickle_vis_data_pandas():
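    """Reads link coordinates and visual regions from link_features, keeps one
    row per (source, target) article pair and writes the article id pairs of
    the infobox, navbox, left-body and body regions to TSV files (the lead
    region is filtered but not written out here).
    """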
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()


    df = pd.read_sql('select source_article_id, target_article_id, target_y_coord_1920_1080, target_x_coord_1920_1080, visual_region from link_features', conn)
    print len(df)

    no_dup = df.sort(['source_article_id','target_y_coord_1920_1080','target_x_coord_1920_1080']).groupby(["source_article_id", "target_article_id"]).first()
    print len(no_dup)

    feature = no_dup.loc[no_dup['visual_region']=='lead']
    print len(feature)
    feature.reset_index(inplace=True)


    feature = no_dup.loc[no_dup['visual_region']=='infobox']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/infobox.tsv', sep='\t', index=False)

    feature = no_dup.loc[no_dup['visual_region']=='navbox']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/navbox.tsv', sep='\t', index=False)

    feature = no_dup.loc[no_dup['visual_region']=='left-body']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/left-body.tsv', sep='\t',index=False)

    feature = no_dup.loc[no_dup['visual_region']=='body']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/body.tsv', sep='\t',index=False)
Example #3
def build_links_position_table():
    """creates up the basic database structure
    """
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                       DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()

    cursor.execute(
        'CREATE TABLE `links` ('
        '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,'
        '`source_article_id` BIGINT UNSIGNED NOT NULL,'
        '`target_article_id` BIGINT UNSIGNED NOT NULL,'
        ' target_position_in_text INT UNSIGNED NOT NULL,'
        ' target_position_in_text_only INT UNSIGNED,'
        ' target_position_in_section INT UNSIGNED,'
        ' target_position_in_section_in_text_only INT UNSIGNED,'
        ' section_name VARCHAR(1000) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
        ' section_number INT UNSIGNED,'
        ' target_position_in_table INT UNSIGNED,'
        ' table_number INT UNSIGNED,'
        ' table_css_class VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
        ' table_css_style VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
        ' target_x_coord_1920_1080 INT UNSIGNED DEFAULT NULL,'
        ' target_y_coord_1920_1080 INT UNSIGNED DEFAULT NULL ,'
        'INDEX(`target_article_id`),'
        'INDEX(`source_article_id`)'
        ') ENGINE=InnoDB;')
    connection.close()
def pickle_correlations_zeros():
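    """Sums the per-link transition counts from link_features (zeros kept via
    IFNULL) per target article and writes them to article_counts.tsv.
    """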
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id, IFNULL(counts, 0) as counts from link_features group by source_article_id, target_article_id', conn)
    print 'group'
    article_counts = df.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    print 'write to file'
    article_counts[["target_article_id","counts"]].to_csv(TMP+'article_counts.tsv', sep='\t', index=False)
Example #5
def build_page_length_table():
    """creates up the basic database structure
    """
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()

    cursor.execute('CREATE TABLE `page_length` ('
                      '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY,'
                      ' page_length_1920_1080 INT UNSIGNED DEFAULT NULL'
                  ') ENGINE=InnoDB;')
    connection.close()
def build_page_length_table():
    """creates up the basic database structure
    """
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()

    cursor.execute('CREATE TABLE `redirects_candidates_page_length` ('
                      '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY,'
                      ' page_length_1920_1080 INT UNSIGNED DEFAULT NULL'
                  ') ENGINE=InnoDB;')
    connection.close()
Example #7
def pickle_vis_data_pandas():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                       DATABASE_NAME)
    conn = db._create_connection()

    df = pd.read_sql(
        'select source_article_id, target_article_id, target_y_coord_1920_1080, target_x_coord_1920_1080, visual_region from link_features',
        conn)
    print len(df)

    no_dup = df.sort([
        'source_article_id', 'target_y_coord_1920_1080',
        'target_x_coord_1920_1080'
    ]).groupby(["source_article_id", "target_article_id"]).first()
    print len(no_dup)

    feature = no_dup.loc[no_dup['visual_region'] == 'lead']
    print len(feature)
    feature.reset_index(inplace=True)

    feature = no_dup.loc[no_dup['visual_region'] == 'infobox']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id',
             'target_article_id']].to_csv('/home/ddimitrov/tmp/infobox.tsv',
                                          sep='\t',
                                          index=False)

    feature = no_dup.loc[no_dup['visual_region'] == 'navbox']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id',
             'target_article_id']].to_csv('/home/ddimitrov/tmp/navbox.tsv',
                                          sep='\t',
                                          index=False)

    feature = no_dup.loc[no_dup['visual_region'] == 'left-body']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id',
             'target_article_id']].to_csv('/home/ddimitrov/tmp/left-body.tsv',
                                          sep='\t',
                                          index=False)

    feature = no_dup.loc[no_dup['visual_region'] == 'body']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id',
             'target_article_id']].to_csv('/home/ddimitrov/tmp/body.tsv',
                                          sep='\t',
                                          index=False)
def plot_degree_filtered_sql():
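    """Builds two graph-tool networks from link_occurences, restricted to
    source (resp. target) articles that occur in the clickstream data, drops
    zero-degree nodes and saves both networks as .xml.gz files.
    """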
    print 'before select'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                       DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    cursor.execute(
        'SELECT source_article_id, target_article_id FROM link_occurences where source_article_id in '
        ' (select distinct prev_id from clickstream_derived_internal_links);')
    result = cursor.fetchall()
    network = Graph()
    print 'after select'
    print 'result len'
    print len(result)

    for i, link in enumerate(result):
        if i % 1000000 == 0:
            print i, len(result)
        network.add_edge(link[0], link[1])

    # filter all nodes that have no edges
    print 'filter nodes with degree zero graph tool specific code'
    network = GraphView(network,
                        vfilt=lambda v: v.out_degree() + v.in_degree() > 0)
    print 'before save'
    network.save(
        "output/wikipedianetworkfilteredwithtransitions_prev_id.xml.gz")
    print 'done'

    cursor.execute(
        'SELECT source_article_id, target_article_id FROM link_occurences where target_article_id in '
        ' (select distinct curr_id from clickstream_derived_internal_links);')
    result = cursor.fetchall()
    network = Graph()
    print 'after select'
    print 'result len'
    print len(result)

    for i, link in enumerate(result):
        if i % 1000000 == 0:
            print i, len(result)
        network.add_edge(link[0], link[1])

    # filter all nodes that have no edges
    print 'filter nodes with degree zero graph tool specific code'
    network = GraphView(network,
                        vfilt=lambda v: v.out_degree() + v.in_degree() > 0)
    print 'before save'
    network.save(
        "output/wikipedianetworkfilteredwithtransitions_curr_id.xml.gz")
    print 'done'
def build_table():
    """creates up the basic database structure
    """
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()

    cursor.execute('CREATE TABLE `table_css_class` ('
                      '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,'
                      '`source_article_id` BIGINT UNSIGNED NOT NULL,'
                      ' css_class VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
                      'INDEX(`source_article_id`)'
                  ') ENGINE=InnoDB;')
    connection.close()
Example #10
def correlations(network_name):
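    """Loads the summed internal-link clickstream counts per article from
    clickstream_derived.
    """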
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    # wikipedia  graph  structural statistics

    results = None
    try:
        results = cursor.execute('select c.curr_id,  sum(c.counts) as counts from clickstream_derived c where c.link_type_derived= %s  group by c.curr_id;', ("internal-link",))
        results = cursor.fetchall()


    except MySQLdb.Error, e:
        print ('error retrieving clickstream counts %s (%d)' % (e.args[1], e.args[0]))
def build_table():
    """creates up the basic database structure
    """
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                       DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()

    cursor.execute(
        'CREATE TABLE `table_css_class` ('
        '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,'
        '`source_article_id` BIGINT UNSIGNED NOT NULL,'
        ' css_class VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
        'INDEX(`source_article_id`)'
        ') ENGINE=InnoDB;')
    connection.close()
Example #12
def pickle_correlations_zeros_january():
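    """Left-joins the January 2015 clickstream counts onto all links from
    link_features, sums the counts per target article (filling missing sums
    with zero) and writes the result to january_article_counts.tsv.
    """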
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id from link_features', conn)
    print 'loaded links'
    df2 = pd.read_sql('select prev_id, curr_id, counts from clickstream_derived_en_201501  where link_type_derived= "internal-link";',  conn)
    print 'loaded counts'
    result = pd.merge(df, df2, how='left', left_on = ['source_article_id', 'target_article_id'], right_on = ['prev_id', 'curr_id'])
    print 'merged counts'
    print result
    article_counts = result.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    article_counts['counts'].fillna(0.0, inplace=True)
    print article_counts
    print 'write to file'
    article_counts[["target_article_id","counts"]].to_csv(TMP+'january_article_counts.tsv', sep='\t', index=False)
Example #13
def rbo():
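    """Collects the ids of the 10,000 articles with the highest clickstream
    counts for the "entry-sm" link type.
    """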
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    sm = []
    try:
        cursor.execute('select curr_id, sum(counts) as counts_sum, curr_title from clickstream_derived where link_type_derived=%s group by curr_id order by counts_sum desc limit 10000;', ("entry-sm",))
        result = cursor.fetchall()
        for row in result:
            record = {}
            record['curr_id']= row[0]
            record['counts_sum'] = row[1]
            record['curr_title'] = row[2]
            sm.append(row[0])
    except MySQLdb.Error, e:
        print e
def plot_degree_filtered_sql():
    print 'before select'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    cursor.execute('SELECT source_article_id, target_article_id FROM link_occurences where source_article_id in '
                   ' (select distinct prev_id from clickstream_derived_internal_links);')
    result = cursor.fetchall()
    network = Graph()
    print 'after select'
    print 'result len'
    print len(result)

    for i, link in enumerate(result):
        if i % 1000000==0:
            print i, len(result)
        network.add_edge(link[0], link[1])

    # filter all nodes that have no edges
    print 'filter nodes with degree zero graph tool specific code'
    network = GraphView(network, vfilt=lambda v : v.out_degree()+v.in_degree()>0 )
    print 'before save'
    network.save("output/wikipedianetworkfilteredwithtransitions_prev_id.xml.gz")
    print 'done'

    cursor.execute('SELECT source_article_id, target_article_id FROM link_occurences where target_article_id in '
                   ' (select distinct curr_id from clickstream_derived_internal_links);')
    result = cursor.fetchall()
    network = Graph()
    print 'after select'
    print 'result len'
    print len(result)

    for i, link in enumerate(result):
        if i % 1000000==0:
            print i, len(result)
        network.add_edge(link[0], link[1])

    # filter all nodes that have no edges
    print 'filter nodes with degree zero graph tool specific code'
    network = GraphView(network, vfilt=lambda v : v.out_degree()+v.in_degree()>0 )
    print 'before save'
    network.save("output/wikipedianetworkfilteredwithtransitions_curr_id.xml.gz")
    print 'done'
Example #15
def rbo():
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                       DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    sm = []
    try:
        cursor.execute(
            'select curr_id, sum(counts) as counts_sum, curr_title from clickstream_derived where link_type_derived=%s group by curr_id order by counts_sum desc limit 10000;',
            ("entry-sm", ))
        result = cursor.fetchall()
        for row in result:
            record = {}
            record['curr_id'] = row[0]
            record['counts_sum'] = row[1]
            record['curr_title'] = row[2]
            sm.append(row[0])
    except MySQLdb.Error, e:
        print e
Example #16
def get_redirecsfromXML(self, dump_date):
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    df = pd.read_sql('select * from redirects', conn)
    return df.set_index('source_article_name')['target_article_name'].to_dict()
def print_table():
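    """Loads link features together with semantic and topic similarities,
    de-duplicates links per (source, target) pair and prints a result table
    with one line per link property (degree, k-core, PageRank, similarity,
    screen position and visual region) via resultTableLine.
    """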
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    df = pd.read_sql('select source_article_id, target_article_id, rel_degree, rel_in_degree, rel_out_degree, '
                     'rel_page_rank, rel_kcore, target_x_coord_1920_1080, target_y_coord_1920_1080, visual_region, '
                     'IFNULL(counts, 0) as counts from link_features order by source_article_id, target_y_coord_1920_1080, target_x_coord_1920_1080', conn)

    print "dup"
    #no_dup = df.sort(['source_article_id','target_y_coord_1920_1080','target_x_coord_1920_1080']).groupby(["source_article_id", "target_article_id"]).first()
    no_dup = df.groupby(["source_article_id", "target_article_id"]).first()

    no_dup = no_dup.reset_index()
    print "no dup"
    del df
    #print no_dup
    df_top = pd.read_sql("select source_article_id, target_article_id, sim as topic_similarity  from topic_similarity", conn)
    print "no up"
    topDF = df_top.groupby("source_article_id", as_index=False)["topic_similarity"].median()
    #print topDF
    print "no up1"
    topDF.columns = ["source_article_id", "topic_similarity_article_median"]
    #print topDF
    print "no up2"
    df_top = df_top.merge(topDF, on="source_article_id")
    #print df_top[(df_top['topic_similarity_article_median'] >0)]
    print "no up3"

    df_sem = pd.read_sql("select source_article_id, target_article_id, sim as sem_similarity from semantic_similarity", conn)
    print "no up4"
    semDF = df_sem.groupby("source_article_id", as_index=False)["sem_similarity"].median()
    #rename
    print "no up5"
    semDF.columns = ["source_article_id", "sem_similarity_article_median"]
    print "no up6"
    #print df_top
    df_sem = df_sem.merge(semDF, on="source_article_id")
    #print len(df_sem)
    print "no up7"
    df1 = no_dup.merge(df_sem[['source_article_id', 'sem_similarity', 'sem_similarity_article_median']], on="source_article_id")
    #print no_dup
    del df_sem, semDF
    df = no_dup.merge(df_top[['source_article_id', 'topic_similarity', 'topic_similarity_article_median']], on="source_article_id")
    print "no up9"
    del no_dup
    del df_top, topDF

    table = ""

    table += resultTableLine (df, "src_degr > target_degr", "df.rel_degree > 0")
    table += resultTableLine (df, "src_degr <= target_degr", "df.rel_degree <= 0")


    table += resultTableLine (df, "src_in_degr > target_in_degr", "df.rel_in_degree > 0")
    table += resultTableLine (df, "src_in_degr <= target_in_degr", "df.rel_in_degree <= 0")


    table += resultTableLine (df, "src_out_degr > target_out_degr", "df.rel_out_degree > 0")
    table += resultTableLine (df, "src_out_degr <= target_out_degr", "df.rel_out_degree <= 0")

    table += resultTableLine (df, "src_kcore > target_kcore", "df.rel_kcore > 0")
    table += resultTableLine (df, "src_kcore <= target_kcore", "df.rel_kcore <= 0")

    table += resultTableLine (df, "src_page_rank > target_page_rank", "df.rel_page_rank > 0")
    table += resultTableLine (df, "src_page_rank <= target_page_rank", "df.rel_page_rank <= 0")


    table += resultTableLine (df1, "text_sim > median(text_sim) of page", "df.sem_similarity > df.sem_similarity_article_median")
    table += resultTableLine (df1, "text_sim <= median(text_sim) of page", "df.sem_similarity <= df.sem_similarity_article_median")

    table += resultTableLine (df, "topic_sim > median(topic_sim) of page", "df.topic_similarity > df.topic_similarity_article_median")
    table += resultTableLine (df, "topic_sim <= median(topic_sim) of page", "df.topic_similarity <= df.topic_similarity_article_median")


    table += resultTableLine (df, "left third of screen", "df.target_x_coord_1920_1080 <= 360")
    table += resultTableLine (df, "middle third of screen", "(df.target_x_coord_1920_1080 > 360) & (df.target_x_coord_1920_1080 <= 720)")
    table += resultTableLine (df, "right third of screen", "df.target_x_coord_1920_1080 > 720")

    table += resultTableLine (df, "position = lead", "df.visual_region == 'lead'")
    table += resultTableLine (df, "position = body", "(df.visual_region == 'body') | (df.visual_region == 'left-body')")
    table += resultTableLine (df, "position = navbox", "df.visual_region == 'navbox'")
    #table += resultTableLine (df, "position = left-body", "df.visual_region == 'left-body'")
    table += resultTableLine (df, "position = infobox", "df.visual_region == 'infobox'")


    print table
from wsd.database import MySQLDatabase
from graph_tool.all import *
from conf import *

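# Builds the Wikipedia link network from link_occurences, drops zero-degree
# nodes and computes structural node statistics (local clustering, PageRank,
# eigenvector centrality, k-core).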
db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                   DATABASE_NAME)
conn = db._create_connection()
cursor = conn.cursor()
cursor.execute(
    'SELECT source_article_id, target_article_id FROM link_occurences;')
result = cursor.fetchall()
wikipedia = Graph()

for link in result:
    wikipedia.add_edge(link[0], link[1])

# filter all nodes that have no edges
wikipedia = GraphView(wikipedia,
                      vfilt=lambda v: v.out_degree() + v.in_degree() > 0)

print "clust"
wikipedia.vertex_properties["local_clust"] = local_clustering(wikipedia)

print "page_rank"
wikipedia.vertex_properties["page_rank"] = pagerank(wikipedia)

print "eigenvector_centr"
eigenvalue, eigenvectorcentr = eigenvector(wikipedia)
wikipedia.vertex_properties["eigenvector_centr"] = eigenvectorcentr

print "kcore"
Example #19
def weighted_pagerank():
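    """Computes weighted and unweighted PageRank (damping 0.8-0.95) on the
    Wikipedia link network, using link occurrence counts and semantic
    similarity scores as edge weights, and saves the annotated networks under
    output/weightedpagerank/.
    """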
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    cursor.execute('SELECT source_article_id, target_article_id, occ FROM link_occurences;')
    result = cursor.fetchall()
    wikipedia = Graph()
    eprop = wikipedia.new_edge_property("int")

    for link in result:
        e = wikipedia.add_edge(link[0], link[1])
        eprop[e] = link[2]
    # filter all nodes that have no edges
    wikipedia = GraphView(wikipedia, vfilt=lambda v : v.out_degree()+v.in_degree()>0 )


    print "page_rank_weighted"
    for damping in [0.8, 0.85, 0.9 ,0.95]:
        print damping
        key = "page_rank_weighted"+str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, weight=eprop,damping=damping)

    print "page_rank"
    for damping in [0.8, 0.85, 0.9 ,0.95]:
        print damping
        key = "page_rank"+str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, damping=damping)




    wikipedia.save("output/weightedpagerank/wikipedianetwork_link_occ.xml.gz")
    print 'link_occ done'


    cursor.execute('SELECT source_article_id, target_article_id, sim FROM semantic_similarity group by '
                   'source_article_id, target_article_id;')
    result = cursor.fetchall()
    wikipedia = Graph()
    eprop = wikipedia.new_edge_property("double")

    for link in result:
        e = wikipedia.add_edge(link[0], link[1])
        eprop[e] = link[2]
    # filter all nodes that have no edges
    print 'filter nodes graph tool specific code'
    wikipedia = GraphView(wikipedia, vfilt=lambda v : v.out_degree()+v.in_degree()>0 )


    print "page_rank_weighted"
    for damping in [0.8, 0.85, 0.9 ,0.95]:
        print damping
        key = "page_rank_weighted"+str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, weight=eprop,damping=damping)

    print "page_rank"
    for damping in [0.8, 0.85, 0.9 ,0.95]:
        print damping
        key = "page_rank"+str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, damping=damping)


    wikipedia.save("output/weightedpagerank/wikipedianetwork_sem_sim_distinct_links.xml.gz")
    print 'sem sim distinct links done'

    cursor.execute('SELECT source_article_id, target_article_id, sim FROM semantic_similarity;')
    result = cursor.fetchall()
    wikipedia = Graph()
    eprop = wikipedia.new_edge_property("double")

    for link in result:
        e = wikipedia.add_edge(link[0], link[1])
        eprop[e] = link[2]
    # filter all nodes that have no edges
    wikipedia = GraphView(wikipedia, vfilt=lambda v : v.out_degree()+v.in_degree()>0 )


    print "page_rank_weighted"
    for damping in [0.8, 0.85, 0.9 ,0.95]:
        print damping
        key = "page_rank_weighted"+str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, weight=eprop,damping=damping)

    print "page_rank"
    for damping in [0.8, 0.85, 0.9 ,0.95]:
        print damping
        key = "page_rank"+str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, damping=damping)

    wikipedia.save("output/weightedpagerank/wikipedianetwork_sem_sim.xml.gz")
    print 'sem_sim done'
Example #20
def weighted_pagerank_hyp_engineering(labels):
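    """Builds sparse hypothesis matrices over the article vocabulary
    (structural links, k-core, semantic similarity from the database and the
    lead/infobox/left-body visual regions from TSV files), normalizes and
    combines them, and uses each selected hypothesis as edge weights for
    weighted PageRank on the Wikipedia network; the annotated network is
    saved under output/weightedpagerank/.
    """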

    #read vocab, graph
    graph =  read_pickle(SSD_HOME+"pickle/graph")
    print "loaded graph"
    values =  read_pickle(SSD_HOME+"pickle/values")
    values_kcore = read_pickle(SSD_HOME+"pickle/values_kcore")

    # transform kcore values to model going out of the kcore
    values_kcore = [1./np.sqrt(float(x)) for x in values_kcore]
    print 'kcore values transformation'

    #sem_sim_hyp = read_pickle(SSD_HOME+"pickle/sem_sim_hyp")
    #print "sem_sim_hyp values"

    #lead_hyp = read_pickle(SSD_HOME+"pickle/lead_hyp")
    #infobox_hyp = read_pickle(SSD_HOME+"pickle/infobox_hyp")
    #left_body_hyp = read_pickle(SSD_HOME+"pickle/left-body_hyp")
    #print "gamma values"

    vocab = read_pickle(SSD_HOME+"pickle/vocab")
    print "loaded vocab"

    state_count = len(vocab)
    states = vocab.keys()
    shape = (state_count, state_count)


    hyp_structural = csr_matrix((values, (graph[0], graph[1])),
                                shape=shape, dtype=np.float)


    hyp_kcore = csr_matrix((values_kcore, (graph[0], graph[1])),
                           shape=shape, dtype=np.float)
    print "hyp_kcore"

    del graph
    del values_kcore

    print "after delete"


    #read sem sim from db and create hyp
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id, sim from semantic_similarity', conn)
    print 'map sem sim'
    sem_sim_hyp_i = map_to_hyp_indicies(vocab, df['source_article_id'])
    sem_sim_hyp_j = map_to_hyp_indicies(vocab, df['target_article_id'])

    hyp_sem_sim = csr_matrix((df['sim'].values, (sem_sim_hyp_i, sem_sim_hyp_j)),
                             shape=shape, dtype=np.float)
    print 'done map sem sim'
    print hyp_sem_sim.shape
    del sem_sim_hyp_i
    del sem_sim_hyp_j
    del df

    #read vis from csv and create hyp
    lead = pd.read_csv(TMP+'lead.tsv',sep='\t')
    lead_i = map_to_hyp_indicies(vocab, lead['source_article_id'])
    lead_j = map_to_hyp_indicies(vocab, lead['target_article_id'])
    lead_v = np.ones(len(lead_i), dtype=np.float)
    
    hyp_lead = csr_matrix((lead_v, (lead_i, lead_j)),
                            shape=shape, dtype=np.float)
    print 'done map lead'
    print hyp_lead.shape
    del lead
    del lead_i
    del lead_j
    del lead_v

    infobox = pd.read_csv(TMP+'infobox.tsv',sep='\t')
    infobox_i = map_to_hyp_indicies(vocab, infobox['source_article_id'])
    infobox_j = map_to_hyp_indicies(vocab, infobox['target_article_id'])
    infobox_v = np.ones(len(infobox_i), dtype=np.float)

    hyp_infobox = csr_matrix((infobox_v, (infobox_i, infobox_j)),
                             shape=shape, dtype=np.float)
    print 'done map infobox'
    print hyp_infobox.shape
    del infobox
    del infobox_i
    del infobox_j
    del infobox_v

    left_body = pd.read_csv(TMP+'left-body.tsv',sep='\t')
    left_body_i = map_to_hyp_indicies(vocab, left_body['source_article_id'])
    left_body_j = map_to_hyp_indicies(vocab, left_body['target_article_id'])
    left_body_v = np.ones(len(left_body_i), dtype=np.float)

    hyp_left_body = csr_matrix((left_body_v, (left_body_i, left_body_j)),
                               shape=shape, dtype=np.float)
    print 'done map left-body'
    print hyp_left_body.shape
    del left_body
    del left_body_i
    del left_body_j
    del left_body_v

    #add the visual hyps to one matrix and set all non zero fields to 1.0
    print 'before gamma'
    hyp_gamma = hyp_left_body + hyp_infobox + hyp_lead
    hyp_gamma.data = np.ones_like(hyp_gamma.data, dtype=np.float)
    print 'after gamma'

    del hyp_left_body
    del hyp_infobox
    del hyp_lead

    #norm
    print "in norm each "
    hyp_structural = norm(hyp_structural)
    hyp_kcore = norm(hyp_kcore)
    hyp_sem_sim = norm(hyp_sem_sim)
    hyp_gamma = norm(hyp_gamma)

    #engineering of hypos and norm again
    hyp_kcore_struct = norm(hyp_structural + hyp_kcore)
    hyp_visual_struct = norm(hyp_structural + hyp_gamma)
    hyp_sem_sim_struct = norm(hyp_structural + hyp_sem_sim)

    hyp_mix_semsim_kcore = norm(hyp_kcore + hyp_sem_sim)
    hyp_mix_semsim_visual = norm(hyp_sem_sim + hyp_gamma)
    hyp_mix_kcore_visual= norm(hyp_kcore + hyp_gamma)


    hyp_all = norm(hyp_kcore + hyp_sem_sim + hyp_gamma)
    hyp_all_struct =  norm(hyp_kcore + hyp_sem_sim + hyp_gamma + hyp_structural)

    hyp_semsim_struct = norm(hyp_structural + hyp_kcore)

    print 'test hypos'


    hypos={}
    hypos['hyp_kcore']=hyp_kcore
    hypos['hyp_sem_sim']=hyp_sem_sim
    hypos['hyp_visual']=hyp_gamma

    hypos['hyp_kcore_struct']=hyp_kcore_struct
    hypos['hyp_visual_struct']=hyp_visual_struct
    hypos['hyp_sem_sim_struct']=hyp_sem_sim_struct

    hypos['hyp_mix_semsim_kcore']=hyp_mix_semsim_kcore
    hypos['hyp_mix_semsim_visual']=hyp_mix_semsim_visual
    hypos['hyp_mix_kcore_visual']=hyp_mix_kcore_visual

    hypos['hyp_all']=hyp_all
    hypos['hyp_all_struct']=hyp_all_struct



    #load network
    print "weighted page rank engineering"
    wikipedia = load_graph("output/wikipedianetwork.xml.gz")

    #for label, hyp in hypos.iteritems():
    name = '_'.join(labels)
    for label in labels:
        print label
        eprop = create_eprop(wikipedia,  hypos[label], vocab)
        wikipedia.edge_properties[label]=eprop
        #for damping in [0.8, 0.85, 0.9 ,0.95]:
        for damping in [0.85]:
            key = label+"_page_rank_weighted_"+str(damping)
            print key
            wikipedia.vertex_properties[key] = pagerank(wikipedia, weight=eprop, damping=damping)
        print 'save network'

        wikipedia.save("output/weightedpagerank/wikipedianetwork_hyp_engineering_"+name+".xml.gz")

    print 'save network'
    wikipedia.save("output/weightedpagerank/wikipedianetwork_hyp_engineering_"+name+".xml.gz")
    print 'done'
Example #21
def correlations_zeros(labels, consider_zeros=True, clickstream_data='', struct=False):
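    """Correlates the hypothesis-weighted PageRank values (damping 0.8, 0.85,
    0.9) with clickstream transition counts using Pearson, Spearman and
    Kendall's tau, either including zero-count articles read from
    article_counts.tsv or using only the counts returned by the database, and
    pickles the resulting correlation dictionaries.
    """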
    #load network
    print struct
    name = '_'.join(labels)
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering_"+name+".xml.gz")
    #read counts with zeros
    if consider_zeros:
        article_counts  =  pd.read_csv(TMP+clickstream_data+'article_counts.tsv', sep='\t')
        print TMP+clickstream_data+'article_counts.tsv'
        correlations_weighted_pagerank = {}
        for label in labels:
            if struct:
                label = label[7:]
            for damping in [0.8,0.85,0.9]:
                key = label+"_page_rank_weighted_"+str(damping)
                pagerank = wikipedia.vertex_properties[key]
                page_rank_values = list()
                counts = list()
                correlations_values = {}
                for index, row in article_counts.iterrows():
                    counts.append(float(row['counts']))
                    page_rank_values.append(pagerank[wikipedia.vertex(int(row['target_article_id']))])
                print 'pearson'
                p = pearsonr(page_rank_values, counts)
                print p
                correlations_values['pearson']=p
                print 'spearmanr'
                s = spearmanr(page_rank_values, counts)
                print s
                correlations_values['spearmanr']=s
                print 'kendalltau'
                k = kendalltau(page_rank_values, counts)
                print k
                correlations_values['kendalltau']=k
                correlations_weighted_pagerank[key]=correlations_values

        write_pickle(HOME+'output/correlations/'+clickstream_data+'correlations_pagerank_'+name+'.obj', correlations_weighted_pagerank)
    else:
        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
        conn = db._create_connection()
        cursor = conn.cursor()
        # wikipedia  graph  structural statistics

        results = None
        try:
            if clickstream_data != '':

                results = cursor.execute('select c.curr_id,  sum(c.counts) as counts from clickstream_derived c where c.link_type_derived= %s  group by c.curr_id;', ("internal-link",))
                results = cursor.fetchall()
            else:
                results = cursor.execute('select c.curr_id,  sum(c.counts) as counts from clickstream_derived_en_201501 c where c.link_type_derived= %s  group by c.curr_id;', ("internal-link",))
                results = cursor.fetchall()

        except MySQLdb.Error, e:
            print ('error retrieving clickstream counts %s (%d)' % (e.args[1], e.args[0]))
        print 'after sql load'


        correlations_weighted_pagerank = {}
        for label in labels:
            if struct:
                label = label[7:]
            for damping in [0.8,0.85,0.9]:
                key = label+"_page_rank_weighted_"+str(damping)
                pagerank = wikipedia.vertex_properties[key]
                correlations={}
                counts=[]
                page_rank_values=[]
                for row in results:
                    counts.append(float(row[1]))
                    page_rank_values.append(pagerank[wikipedia.vertex(int(row[0]))])
                print 'pearson'
                p = pearsonr(page_rank_values, counts)
                print p
                correlations['pearson']=p
                print 'spearmanr'
                s= spearmanr(page_rank_values, counts)
                print s
                correlations['spearmanr']=s
                print 'kendalltau'
                k= kendalltau(page_rank_values, counts)
                print k
                correlations['kendalltau']=k
                correlations_weighted_pagerank[key]=correlations



        write_pickle(HOME+'output/correlations/'+clickstream_data+'correlations_pagerank_without_zeros'+name+'.obj', correlations_weighted_pagerank)
from wsd.database import MySQLDatabase
from graph_tool.all import *
from conf import *


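# Builds the Wikipedia link network from link_occurences, drops zero-degree
# nodes and computes structural node statistics (local clustering, PageRank,
# eigenvector centrality, k-core).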
db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
conn = db._create_connection()
cursor = conn.cursor()
cursor.execute('SELECT source_article_id, target_article_id FROM link_occurences;')
result = cursor.fetchall()
wikipedia = Graph()

for link in result:
    wikipedia.add_edge(link[0], link[1])

# filter all nodes that have no edges
wikipedia = GraphView(wikipedia, vfilt=lambda v : v.out_degree()+v.in_degree()>0 )

print "clust"
wikipedia.vertex_properties["local_clust"] = local_clustering(wikipedia)

print "page_rank"
wikipedia.vertex_properties["page_rank"] = pagerank(wikipedia)

print "eigenvector_centr"
eigenvalue, eigenvectorcentr = eigenvector(wikipedia)
wikipedia.vertex_properties["eigenvector_centr"] = eigenvectorcentr

print "kcore"
wikipedia.vertex_properties["kcore"] = kcore_decomposition(wikipedia)
def print_table():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                       DATABASE_NAME)
    conn = db._create_connection()

    df = pd.read_sql(
        'select source_article_id, target_article_id, rel_degree, rel_in_degree, rel_out_degree, '
        'rel_page_rank, rel_kcore, target_x_coord_1920_1080, target_y_coord_1920_1080, visual_region, '
        'IFNULL(counts, 0) as counts from link_features order by source_article_id, target_y_coord_1920_1080, target_x_coord_1920_1080',
        conn)

    print "dup"
    #no_dup = df.sort(['source_article_id','target_y_coord_1920_1080','target_x_coord_1920_1080']).groupby(["source_article_id", "target_article_id"]).first()
    no_dup = df.groupby(["source_article_id", "target_article_id"]).first()

    no_dup = no_dup.reset_index()
    print "no dup"
    del df
    #print no_dup
    df_top = pd.read_sql(
        "select source_article_id, target_article_id, sim as topic_similarity  from topic_similarity",
        conn)
    print "no up"
    topDF = df_top.groupby("source_article_id",
                           as_index=False)["topic_similarity"].median()
    #print topDF
    print "no up1"
    topDF.columns = ["source_article_id", "topic_similarity_article_median"]
    #print topDF
    print "no up2"
    df_top = df_top.merge(topDF, on="source_article_id")
    #print df_top[(df_top['topic_similarity_article_median'] >0)]
    print "no up3"

    df_sem = pd.read_sql(
        "select source_article_id, target_article_id, sim as sem_similarity from semantic_similarity",
        conn)
    print "no up4"
    semDF = df_sem.groupby("source_article_id",
                           as_index=False)["sem_similarity"].median()
    #rename
    print "no up5"
    semDF.columns = ["source_article_id", "sem_similarity_article_median"]
    print "no up6"
    #print df_top
    df_sem = df_sem.merge(semDF, on="source_article_id")
    #print len(df_sem)
    print "no up7"
    df1 = no_dup.merge(df_sem[[
        'source_article_id', 'sem_similarity', 'sem_similarity_article_median'
    ]],
                       on="source_article_id")
    #print no_dup
    del df_sem, semDF
    df = no_dup.merge(df_top[[
        'source_article_id', 'topic_similarity',
        'topic_similarity_article_median'
    ]],
                      on="source_article_id")
    print "no up9"
    del no_dup
    del df_top, topDF

    table = ""

    table += resultTableLine(df, "src_degr > target_degr", "df.rel_degree > 0")
    table += resultTableLine(df, "src_degr <= target_degr",
                             "df.rel_degree <= 0")

    table += resultTableLine(df, "src_in_degr > target_in_degr",
                             "df.rel_in_degree > 0")
    table += resultTableLine(df, "src_in_degr <= target_in_degr",
                             "df.rel_in_degree <= 0")

    table += resultTableLine(df, "src_out_degr > target_out_degr",
                             "df.rel_out_degree > 0")
    table += resultTableLine(df, "src_out_degr <= target_out_degr",
                             "df.rel_out_degree <= 0")

    table += resultTableLine(df, "src_kcore > target_kcore",
                             "df.rel_kcore > 0")
    table += resultTableLine(df, "src_kcore <= target_kcore",
                             "df.rel_kcore <= 0")

    table += resultTableLine(df, "src_page_rank > target_page_rank",
                             "df.rel_page_rank > 0")
    table += resultTableLine(df, "src_page_rank <= target_page_rank",
                             "df.rel_page_rank <= 0")

    table += resultTableLine(
        df1, "text_sim > median(text_sim) of page",
        "df.sem_similarity > df.sem_similarity_article_median")
    table += resultTableLine(
        df1, "text_sim <= median(text_sim) of page",
        "df.sem_similarity <= df.sem_similarity_article_median")

    table += resultTableLine(
        df, "topic_sim > median(topic_sim) of page",
        "df.topic_similarity > df.topic_similarity_article_median")
    table += resultTableLine(
        df, "topic_sim <= median(topic_sim) of page",
        "df.topic_similarity <= df.topic_similarity_article_median")

    table += resultTableLine(df, "left third of screen",
                             "df.target_x_coord_1920_1080 <= 360")
    table += resultTableLine(
        df, "middle third of screen",
        "(df.target_x_coord_1920_1080 > 360) & (df.target_x_coord_1920_1080 <= 720)"
    )
    table += resultTableLine(df, "right third of screen",
                             "df.target_x_coord_1920_1080 > 720")

    table += resultTableLine(df, "position = lead",
                             "df.visual_region == 'lead'")
    table += resultTableLine(
        df, "position = body",
        "(df.visual_region == 'body') | (df.visual_region == 'left-body')")
    table += resultTableLine(df, "position = navbox",
                             "df.visual_region == 'navbox'")
    #table += resultTableLine (df, "position = left-body", "df.visual_region == 'left-body'")
    table += resultTableLine(df, "position = infobox",
                             "df.visual_region == 'infobox'")

    print table