def create_pagerank_features():
    """
    Build PageRank-based features for the train and the test set.

    Each question text is identified by the hex MD5 digest of its UTF-8
    encoding. Every row of both datasets links the digests of its
    ``question1`` and ``question2`` columns in both directions; the
    resulting adjacency mapping is handed to ``get_pagerank`` and its result
    is then passed, row by row, to ``get_pagerank_value``.
    :return: tuple ``(X_train, X_test)`` with the row-wise results of
        ``get_pagerank_value`` for the train and the test set
    """
    df_train, df_test = load_dataset()

    # Adjacency mapping: question digest -> list of neighbour digests.
    qid_graph = {}

    def _digest(text):
        """
        Return the hex MD5 digest of a question text.
        :param text: question text
        """
        return hashlib.md5(text.encode('utf-8')).hexdigest()

    def generate_qid_graph_table(row):
        """
        Record the link between the two questions of a row in both
        directions. The graph is updated in place.
        :param row: dataframe row
        """
        first = _digest(row["question1"])
        second = _digest(row["question2"])
        for source, target in ((first, second), (second, first)):
            qid_graph.setdefault(source, []).append(target)

    def _row_value(row):
        """
        Evaluate ``get_pagerank_value`` for one row.
        :param row: dataframe row
        """
        return get_pagerank_value(row, pagerank_dict)

    # The graph spans both datasets; the apply results are not needed,
    # only the side effect on ``qid_graph``.
    for frame in (df_train, df_test):
        frame.apply(generate_qid_graph_table, axis=1)
    pagerank_dict = get_pagerank(qid_graph)

    X_train = df_train.apply(_row_value, axis=1)

    # Drop the train set before processing the test set.
    del df_train
    gc.collect()

    X_test = df_test.apply(_row_value, axis=1)
    return X_train, X_test
Beispiel #2
0
    def process_item(self, item, spider):
        """
        Store the PageRank of a ``TaseItem``'s URL in the ``pagerank`` table.

        Items that are not ``TaseItem`` instances are returned untouched.
        For a ``TaseItem`` with a truthy ``url`` the rank is looked up,
        converted to ``float`` and inserted through ``self.cur`` together
        with ``global_time``, ``global_date`` and the item's ``symbol``.
        A rank that cannot be converted is skipped; an integrity error
        from the database is logged.

        :param item: item to process; ``item['url']`` and ``item['symbol']``
            are read
        :param spider: not used by this method
        :return: the item that was passed in
        """
        if not isinstance(item, TaseItem):
            return item
        if item['url']:
            rank = pagerank.get_pagerank(item['url'])
            try:
                # Must not be named ``pagerank``: assigning to that name here
                # would make it local to the function and turn the module
                # lookup above into an UnboundLocalError.
                rank_value = float(rank)

                self.cur.execute(
                    "insert into pagerank (sessionid, date_, symbol, pagerank) "
                    "values (%s, %s, %s, %s)",
                    (
                        global_time,
                        global_date.isoformat(),
                        item['symbol'],
                        rank_value,
                    )
                )
            except (TypeError, ValueError):
                # Rank is missing (None) or not numeric: nothing to store.
                pass
            except MySQLdb.IntegrityError as e:
                log.msg('SQL integrity error: %s' % e)
        # NOTE(review): this looks like a Scrapy item pipeline, where
        # process_item is expected to hand the item on; previously a
        # TaseItem fell off the end and None was returned.
        return item
Beispiel #3
0
 def execute_lookup(key):
     """
     Look up the PageRank for ``key``.

     :param key: value passed to ``pagerank.get_pagerank``
     :return: the rank returned by the lookup, or ``''`` when the lookup
         returned ``None``
     """
     rank = pagerank.get_pagerank(key)
     if rank is None:
         # Pass the key as a logging argument: ``'%s' % (key)`` is not a
         # tuple and fails when the key itself is a tuple; this form also
         # defers formatting until the record is actually emitted.
         logger.debug('Error looking up pagerank for %s', key)
         return ''
     return rank
#!/usr/bin/env python
#  Thanks to Corey Goldberg (http://code.google.com/p/corey-projects/) for his work on pagerank.py

import pagerank

# Read one URL per line from urlList.txt and write one rank per line to
# PRdump.txt. The ``with`` statement closes both files even if a lookup
# raises.
with open('urlList.txt', 'r') as sourceFile, \
        open('PRdump.txt', 'w') as listDump:
    for line in sourceFile:
        # Strip the trailing newline so it is not sent as part of the URL.
        url = line.strip()
        if not url:
            # Blank lines carry no URL to look up.
            continue
        rank = pagerank.get_pagerank(url)
        # Same bytes the original ``print >>listDump, rank, "\n",`` emitted:
        # the rank, a separating space, then a newline.
        listDump.write('%s \n' % (rank,))
Beispiel #5
0
#!/usr/bin/python
# -*- coding: utf-8 -*-

import pagerank

# Print the rank returned for the FSF home page. The parenthesised
# single-argument form prints the same text on Python 2 and Python 3.
print(pagerank.get_pagerank("http://www.fsf.org"))
Beispiel #6
0
def printSortedPageRankList(pages):
    """
    Print the PageRank of every page, in sorted order of the page names.

    :param pages: iterable of page names; each name is used as a key into
        the module-level ``pages_dict`` to obtain the URL to look up
    """
    for page in sorted(pages):
        url = pages_dict[page]
        rank = pagerank.get_pagerank(url)
        # %-formatting instead of ``+`` so that a rank that is not a string
        # is printed rather than raising TypeError.
        print("%s's pagerank is: %s" % (page, rank))
Beispiel #7
0
def get_pageranks(urls=None):
    """
    Look up the PageRank of each URL.

    :param urls: iterable of URLs; ``None`` (the default) means no URLs
    :return: dict mapping each URL to the value returned by
        ``pagerank.get_pagerank`` for it
    """
    # ``None`` replaces the former ``list()`` default, which was a single
    # list object created at definition time and shared by every call.
    if urls is None:
        urls = ()
    prs = {}
    for url in urls:
        # One-element tuple keeps the formatting correct for any url value.
        print("Processing url: %s" % (url,))
        prs[url] = pagerank.get_pagerank(url)
    return prs
Beispiel #8
0
#!/usr/bin/env python

import pagerank

# Print the ranks returned for four sites. Each print call takes a single
# pre-formatted argument so the output is the same on Python 2 and Python 3.
rank = pagerank.get_pagerank('http://www.google.com')
print('google: %s' % (rank,))
print('baidu: %s' % (pagerank.get_pagerank('http://www.baidu.com'),))
print(pagerank.get_pagerank('http://www.csdn.net'))
print(pagerank.get_pagerank('http://www.codeproject.com'))
import csv
import pprint

import pagerank

# Copy articles.csv to articles_pr_2.csv, appending to every row the rank
# returned for the value in its second column. Both files are opened in a
# ``with`` statement so the output is flushed and both handles are closed
# when the loop ends or a lookup raises.
# NOTE(review): binary mode ('wb') is the Python 2 convention for csv
# output; Python 3 would need open(..., 'w', newline='') instead. Confirm
# the target interpreter before changing it.
with open('articles_pr_2.csv', 'wb') as target, \
        open('articles.csv') as source:
    f = csv.writer(target)
    for item in csv.reader(source):
        rank = pagerank.get_pagerank(item[1])
        item.append(rank)
        f.writerow(item)

        print(item)
Beispiel #10
0
#!/usr/bin/env python

import pagerank

# Print the ranks returned for four sites. Each print call takes a single
# pre-formatted argument so the output is the same on Python 2 and Python 3.
rank = pagerank.get_pagerank('http://www.google.com')
print('google: %s' % (rank,))
print('baidu: %s' % (pagerank.get_pagerank('http://www.baidu.com'),))
print(pagerank.get_pagerank('http://www.csdn.net'))
print(pagerank.get_pagerank('http://www.codeproject.com'))