Code Example #1
 def run_pagerank(self, damping_factor=0.85, iterations=10):
     return pagerank(self.index_file,
                     self.edge_file,
                     self.max_node,
                     self.edge_count,
                     damping_factor=damping_factor,
                     iterations=iterations)
Code Example #2
 def test_pagerank(self):
     size = 1000
     g = nx.DiGraph(nx.powerlaw_cluster_graph(size, 3, 0.001))
     N = len(g.nodes())
     tmp_file = tempfile.NamedTemporaryFile(delete=False)
     for node in g.nodes():
         outlinks = g.out_edges(nbunch=[node])
         outlinks = [str(n2) for n1, n2 in outlinks]
         if not outlinks:
             value = 'pr_results,%s,%s' % (1.0/N, N)
             tmp_file.write('%s\t%s\n' % (node, value))
         else:
             outlinks_str = ','.join(outlinks)
             value = 'pr_results,%s,%s,' % (1.0/N, N)
             value += outlinks_str
             tmp_file.write('%s\t%s\n' % (node, value))
     tmp_file.flush()
     input_path = tmp_file.name
     job_id = 'unittest'
     sorted_ids = pagerank(job_id, self.iter_count, input_path, self.top_n)
     fs = HadoopFS()
     fs.rmr('%s/hat_results' % job_id)
     if self.top_n <= size:
         self.assertEqual(len(sorted_ids), self.top_n, 'some ids are missing')
     id_ranges = range(0, size)
     for _id in sorted_ids:
         self.assertIn(int(_id), id_ranges, 'node should be in graph')
Code Example #3
def netrank_gridsearch(network_path, diff_expr, out_path, alpha_prec=11):
    """Perform grid search over alpha parameter.

    This function will compute the netranks for a given network and differential
    expression data for a range of alpha parameters.

    Parameters:
    ----------
    network_path:           Path to the network in hdf5 container (with gene names).
                            Gene names and expression data are assumed to be in
                            the same order as the nodes in the adjacency matrix.

    diff_expr:              Differential expression dataframe. If set to None,
                            PageRank is calculated, otherwise NetRank will be used.

    out_path:               Directory to which the results are written.

    alpha_prec:             Number of alpha values for which netrank scores are
                            computed. Default is 11, which corresponds to
                            computing ranks for alpha = [0, .1, .2, ..., .9, 1].
    """
    alpha_range = np.linspace(0, 1, alpha_prec)
    print("Running grid search for alpha={}".format(alpha_range))
    for alpha in alpha_range:
        scores, gene_names = pagerank.pagerank(network_path, diff_expr, alpha)
        out = os.path.join(out_path, 'netrank_alpha_{}.txt'.format(alpha))
        pagerank.write_ranking(scores, gene_names, out)
        print("Netrank for alpha {} computed successfully!".format(alpha))
    print("Grid Search successfully computed (results in {})".format(out_path))
Code Example #4
	def test_simple(self):
		graph_dict = {"A": {},
					 "B": {"A":{}, "C":{}},
					 "C": {"A": {}},
					 "D": {"A":{}, "B":{}, "C":{}}
					 }	
		graph = Graph(graph_dict)
		ranks = pagerank(graph, iterations=10)
		self.assertTrue(math.isclose(ranks["A"], 0.4583333, abs_tol=1e-4))
Code Example #5
def main():

    sentences = ["i am paradox", "my name is paradox", "i love caffeine"]
    sentences_tokenized = list(map(lambda x: x.split(), sentences))
    print(sentences_tokenized)
    matrix = build_sim_matrix(sentences_tokenized)
    print(matrix)
    result = pagerank(matrix)
    print(result)
Code Example #6
def undirected_page_rank(q, D, p, sim, th, priors):
    results = {}
    doc_ids = q['visited_documents']
    sim_graph = build_graph(cl.get_subset(D, doc_ids), sim, th)

    if priors == 'baseline':    # baseline: priors derived from the 1st-delivery ranking
        priors_vec = {doc_id: 1/(50 + rank) for doc_id, rank in q['visited_documents_orders'].items()}
    elif priors == 'classification':    # classification: priors from the 2nd-delivery document probabilities
        priors_vec = q['document_probabilities']

    pr_values = {'vanilla_pk': pk.pagerank(sim_graph, max_iter=50, weight=None),
                'extended_pk': pk.pagerank(sim_graph, max_iter=50, weight='weight',
                                            personalization=priors_vec)}

    for pr_type in pr_values:
        results[pr_type] = cl.get_subset(pr_values[pr_type],
                                         sorted(list(pr_values[pr_type].keys()), key=lambda x: pr_values[pr_type][x],
                                                reverse=True)[:p])
    return results
Code Example #7
File: nspagerank.py Project: cburschka/nationstates
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('damping', nargs='?', help='The damping factor to use.', type=float, default=0.85)
    args = parser.parse_args()
    graph = endograph.endograph(sys.stdin.read())
    scores = zip(sorted(graph.nations), pagerank.pagerank(graph.matrix(), args.damping))
    print("\n".join(
        "%.10f %s" % (b,a) for (a,b) in 
        sorted(scores, key=lambda z:z[1], reverse=True)
        ))
Code Example #8
	def test_with_iter_and_dampening(self):
		graph_dict = {"A": {},
					 "B": {"A":{}, "C":{}},
					 "C": {"A": {}},
					 "D": {"A":{}, "B":{}, "C":{}}
					 }			
		graph = Graph(graph_dict)
		ranks = pagerank(graph, iterations=0, d=1)
		self.assertTrue(math.isclose(ranks["A"], 0.50747, abs_tol=1e-4))
		self.assertTrue(math.isclose(ranks["B"], 0.27431, abs_tol=1e-4))
		self.assertTrue(math.isclose(ranks["C"], 0.1925, abs_tol=1e-4))
		self.assertTrue(math.isclose(ranks["D"], 0.15, abs_tol=1e-4))
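The two tests above only show call sites. Below is a minimal sketch of the iterative PageRank they appear to exercise, reconstructed from the keyword arguments; it works on the plain dict form of the graph (the tests wrap it in a Graph object), ignores dangling-node redistribution, and is an assumption rather than the project's implementation, so exact values may differ.

# Minimal PageRank sketch reconstructed from the test call sites above (assumption).
# graph_dict maps each node to the nodes it links to; d is the damping factor,
# iterations the number of power-iteration steps.
def pagerank_sketch(graph_dict, iterations=10, d=0.85):
    nodes = list(graph_dict)
    n = len(nodes)
    ranks = {node: 1.0 / n for node in nodes}
    for _ in range(iterations):
        new_ranks = {}
        for node in nodes:
            # Sum the contributions of every node that links to `node`.
            incoming = [src for src in nodes if node in graph_dict[src]]
            new_ranks[node] = (1 - d) / n + d * sum(
                ranks[src] / len(graph_dict[src]) for src in incoming)
        ranks = new_ranks
    return ranks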
Code Example #9
def pagerank_aggregator(objects, threshold, alpha):
    """Implements the pagerank aggregation for a given alpha and epsilon.
    Alpha is for the bias towards surf probability, non-random in this case.
    Epsilon controls the convergence threshold, a small number in practice.

    The program first constructs the graph given the rankers and then calls
    pagerank function repeatedly until the average change in pagerank scores 
    is below epsion. Then, it converts the resulting scores into a ranking.

    """

    ### Construct graph version of the rankers and update indegrees
    indegrees = {}
    total_indegrees = 0.0
    graph = {}
    for key in objects.keys():
        graph[key] = []
        indegrees[key] = 0.0

    for key1 in objects.keys():
        for key2 in objects.keys():
            if key1 != key2:
                count = num_higher(objects, key1, key2) 
                if count > 0:
                    graph[key1].append( (key2,float(count)) )
                    indegrees[key2] += count
                    total_indegrees += count


    ## Normalize so each node's outlink weights sum to 1
    for key1 in graph.keys():
        total = 0
        for (key2,val) in graph[key1]:
            total += val
        for i in range(len(graph[key1])):
            (key2, val) = graph[key1][i]
            graph[key1][i] = (key2, val/total)

    ##Normalize indegrees as well
    for key in indegrees.keys():
        indegrees[key] /= total_indegrees


    ### Call page rank
    final_scores = pg.pagerank(graph, indegrees, threshold, alpha)

    #final_scores = pg.pagerank(graph, {}, threshold, alpha)

    ### Convert the final scores to a ranking
    ranker = get_ranker_for_scores(final_scores)

    rankscore = kendall_tau(objects, ranker)
    return ranker, rankscore
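The pg.pagerank routine called above is not shown. The sketch below is a weighted, personalized PageRank consistent with the data structures built here (graph maps each key to (neighbour, normalized weight) pairs, indegrees holds the normalized prior) and with the docstring's stopping rule of iterating until the average change drops below the threshold. It is an assumption for illustration, not the project's implementation; dangling-node mass is simply dropped, and alpha is treated here as the link-following weight.

# Assumed sketch of pg.pagerank(graph, priors, threshold, alpha): personalized,
# weighted PageRank iterated until the average per-node change is below threshold.
def pagerank_sketch(graph, priors, threshold, alpha):
    nodes = list(graph)
    n = len(nodes)
    if not priors:  # fall back to a uniform prior if an empty dict is passed
        priors = {node: 1.0 / n for node in nodes}
    scores = {node: 1.0 / n for node in nodes}
    while True:
        # Teleport with probability (1 - alpha) according to the prior,
        # otherwise follow the normalized outlink weights.
        new_scores = {node: (1 - alpha) * priors[node] for node in nodes}
        for node in nodes:
            for neighbour, weight in graph[node]:
                new_scores[neighbour] += alpha * weight * scores[node]
        avg_change = sum(abs(new_scores[node] - scores[node]) for node in nodes) / n
        scores = new_scores
        if avg_change < threshold:
            return scores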
Code Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('damping',
                        nargs='?',
                        help='The damping factor to use.',
                        type=float,
                        default=0.85)
    args = parser.parse_args()
    graph = endograph.endograph(sys.stdin.read())
    scores = zip(sorted(graph.nations),
                 pagerank.pagerank(graph.matrix(), args.damping))
    print("\n".join(
        "%.10f %s" % (b, a)
        for (a, b) in sorted(scores, key=lambda z: z[1], reverse=True)))
Code Example #11
File: pagerank_test.py Project: huxiaoqian/project
def pagerank_simluation_test(input_path):
    iter_count = 5
    top_n = 500
    job_id = 1
    sorted_ids = pagerank(job_id, iter_count, input_path, top_n)
    if sorted_ids:
        if len(sorted_ids) < 10:
            for i in range(len(sorted_ids)):
                print sorted_ids[i]
        else:
            for i in range(10):
                print sorted_ids[i]
    fs = HadoopFS()
    fs.rmr('%s/hat_results' % job_id)
Code Example #12
File: TextRank.py Project: ReganBell/QReview
def find_key_phrases(tokens, parts_of_speech, window):

    tagged_tokens = nltk.pos_tag(tokens)

    nodes = []
    for token in tokens:
        nodes.append((token, 1))

    edges = []
    for i in range(0, len(tagged_tokens)):
        if tagged_tokens[i][1] in parts_of_speech:
            right = min(i + window, len(tagged_tokens))
            for j in range(i+1, right):
                edge = (tokens[i], tokens[j], {"weight": 1})
                edges.append(edge)
    return pagerank(nodes, edges, 15)
Code Example #13
File: area.py Project: huxiaoqian/project
def pagerank_rank(top_n, date, topic_id, window_size):
    data = []

    tmp_file = prepare_data_for_pr(topic_id, date, window_size)

    if not tmp_file:
        return data

    input_tmp_path = tmp_file.name

    job_id = generate_job_id(datetime2ts(date), window_size, topic_id)
    iter_count = PAGERANK_ITER_MAX

    sorted_uids = pagerank(job_id, iter_count, input_tmp_path, top_n)

    print sorted_uids
Code Example #14
File: GraphSim.py Project: sethupathib/MAGICAL
 def pagerankCenter(graph):
     if len(graph.nodes) == 1:
         return graph.nodes[0], 0
     simMatrix = pagerank(graph, graph.nodes)
     sumDist = dict()
     minDist = float("inf")
     centerList = list()
     center = None
     for node in graph.nodes:
         sumDist[node] = sum(list(simMatrix[node].values()))
         if minDist > sumDist[node]:
             centerList = [node]
             minDist = sumDist[node]
         elif minDist == sumDist[node]:
             centerList.append(node)
     center = centerList[0]
     radius = nx.eccentricity(graph, v=center)
     return center, radius
Code Example #15
def main():
    parser = argparse.ArgumentParser(description='Running PageRank')
    parser.add_argument('path_file',
                        metavar='f',
                        type=str,
                        help='path to file to perform page ranking')
    parser.add_argument('beta',
                        metavar='b',
                        type=float,
                        help='allow teleportation with 1-beta probability')
    parser.add_argument('eps',
                        metavar='eps',
                        type=float,
                        help='epsilon value for convergence')
    parser.add_argument('n_nodes',
                        metavar='nodes',
                        type=int,
                        help='number of unique nodes')
    parser.add_argument('n_edges',
                        metavar='edges',
                        type=int,
                        help='number of all edges')
    parser.add_argument('pow_iter',
                        metavar='pow_iter',
                        type=int,
                        help='number of iterations')

    args = parser.parse_args()
    file_path = args.path_file
    beta = args.beta
    eps = args.eps
    n_nodes = args.n_nodes
    n_edges = args.n_edges
    num_iterations = args.pow_iter

    result = pagerank(file_name=file_path,
                      beta=beta,
                      power_iterations=num_iterations,
                      num_edges=n_edges,
                      num_nodes=n_nodes,
                      eps=eps)
Code Example #16
File: area.py Project: huxiaoqian/project
def pagerank_rank(top_n, date, topic_id, window_size):
    data = []

    tmp_file = prepare_data_for_pr(topic_id, date, window_size)

    if not tmp_file:
        return data

    input_tmp_path = tmp_file.name
    
    job_id = generate_job_id(datetime2ts(date), window_size, topic_id)
    iter_count = PAGERANK_ITER_MAX

    sorted_uids = pagerank(job_id, iter_count, input_tmp_path, top_n)

    topicname = acquire_topic_name(topic_id)
    if not topicname:
        return data

    data = save_rank_results(sorted_uids, 'topic', 'pagerank', date, window_size, topicname)

    return data
Code Example #17
    for row in reader:
        all_realted_paper_li.append(row[0])
print "all_realted_paper_li create over"
print "len(all_realted_paper_li):", len(all_realted_paper_li)

# Read the author's papers from the first five years
paper5_set = set()
with open(r'..\result\paper5.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row[0] in all_realted_paper_li:
            paper5_set.add(row[0])
print "paper5_set and paper5_li create over"
print "len(paper5_set):", len(paper5_set)

with open(r'..\g_result\all_information_2.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row[0] in paper5_set:
            graph.add_node(row[0])
            cite_paper_li = row[7][2: -2].split('\', \'')
            for cite_paper in cite_paper_li:
                if cite_paper in all_realted_paper_li:
                    graph.add_edge(row[0], cite_paper)
print "graph create over"
print graph.number_of_nodes()
print graph.number_of_edges()

rank_value = pagerank.pagerank(graph, max_iteration=1000)
pagerank.save_pagerank_value(rank_value, r'..\result\paper_unweighted_pagerank.csv')
Code Example #18
def crawler():
    try:
        conn = sqlite3.connect(db_file)
        conn.text_factory = str
        pool = threadpool.ThreadPool(16)

        # Init keywords
        with open(word_file, "rb") as f:
            keywords = pickle.load(f)
        with open(searched_word_file, "rb") as f:
            searched = pickle.load(f)

        while True:
            keywords_list = list(keywords)
            while len(keywords_list) != 0:
                init_vals = []
                for k in keywords_list[:key_batch_size]:
                    for site in search_sites:
                        for search_page in site['search_page']:
                            init_vals.append(([
                                site['domain'], search_page, site['header'], k
                            ], None))

                threqs = threadpool.makeRequests(proc_search, init_vals)
                [pool.putRequest(req) for req in threqs]
                pool.wait()

                print("Commiting...")

                cursor = conn.cursor()

                for cmd in webpage_cmds:
                    try:
                        cursor.execute(cmd[0], cmd[1])
                    except:
                        pass

                for cmd in word_cmds:
                    try:
                        cursor.execute(cmd[0], cmd[1])
                    except:
                        pass

                for cmd in ref_cmds:
                    try:
                        cursor.execute(cmd[0], cmd[1])
                    except:
                        pass

                cursor.close()
                conn.commit()

                webpage_cmds.clear()
                ref_cmds.clear()
                word_cmds.clear()

                print("Ranking and indexing...")
                pagerank()
                index()

                print("Saving keywords...")

                for k in keywords_list[:key_batch_size]:
                    searched.add(k)

                keywords_list = keywords_list[key_batch_size:]

                with open(word_file, "wb") as f:
                    pickle.dump(set(keywords_list), f)

                with open(searched_word_file, "wb") as f:
                    pickle.dump(searched, f)

                print("Done.")

            keywords = new_key.copy()
            new_key.clear()

            print("Iterator Done.")

    except Exception as e:
        print(e)
    finally:
        conn.close()
Code Example #19
''' Run PageRank algorithm on the moon landing webgraph '''
from graph import adjacency_list
from pagerank import pagerank
from results import output_rank_json, output_rank_csv, output_random_walks
from randomwalks import random_walks

# Parse dataset into an adjacency list
adj_list = adjacency_list("data/adj_list")
# Non-scaled PageRank
rank = pagerank(adj_list, 10)

# output results of basic pagerank
output_rank_json("data/nodes", rank, "out/unscaledranking.json")
output_rank_csv("data/nodes", rank, "out/unscaledranking.csv")


# Scaled pagerank s = 0.85
scaled_rank = pagerank(adj_list, 10, scaled=True)

# output results of scaled pagerank
output_rank_json("data/nodes", scaled_rank, "out/scaledranking.json")
output_rank_csv("data/nodes", scaled_rank, "out/scaledranking.csv")

# Scaled pagerank s = 0.7
scaled_rank = pagerank(adj_list, 10, scaled=True, s=0.7)
output_rank_csv("data/nodes", scaled_rank, "out/scaledranking7.csv")

# Scaled pagerank s = 0.5
scaled_rank = pagerank(adj_list, 10, scaled=True, s=0.5)
output_rank_csv("data/nodes", scaled_rank, "out/scaledranking5.csv")
Code Example #20
def epidemic_propagation(
        adjencyMatrix,
        scenario="noVaccination",
        initContamination=0.05,
        initVaccination=0.12,
        contaminationRate=0.2,
        cureRate=0.26,
        alpha=0.85,
        epsilon=0.00001):
    """
    Computes the propagation of an epidemy

    Args:
        adjencyMatrix: The adjency matrix of the graph
        scenario: The scenario to execute (noVaccination, randomVaccination, specificVaccination)
        initContamination: X% of the initially infected people
        initVaccination: Y% of the initially vaccinated people
        contaminationRate: % of chance to contamine a neighbour
        cureRate: Self cure rate
        alpha: Dumping factor

    Returns:

    """
    size = adjencyMatrix.shape[0]

    # Vaccinate people
    if scenario == "PRVaccination":  # PageRank vaccination (Most important nodes are vaccinate)
        vaccinated_nodes = pagerank(adjencyMatrix, alpha, epsilon)[: int(size * initVaccination)]
    elif scenario == "randomVaccination":  # Random vaccination
        vaccinated_nodes = np.random.choice(size, int(size * initVaccination), replace=False)
    else:  # No vaccination
        vaccinated_nodes = []

    adjencyMatrix = np.delete(adjencyMatrix, vaccinated_nodes, 0)
    adjencyMatrix = np.delete(adjencyMatrix, vaccinated_nodes, 1)

    # Select randomly nodes that are initially infected
    size = adjencyMatrix.shape[0]
    contam_vector = np.random.choice(size, int(size * initContamination))

    # Infection ratio is the parameter given
    res = [contaminationRate]

    # Iterations
    for i in range(200):
        # Get the neighbours of infected nodes
        neighbours = adjencyMatrix[contam_vector, :].nonzero()[1]
        # Infect neighbour (contaminationRate probability)
        contam_vector = np.unique(
            np.concatenate(
                (
                    contam_vector,
                    np.extract(np.random.rand(neighbours.shape[0]) < contaminationRate, neighbours),
                )
            )
        )

        # Infect non-neighbour (1-alpha probability)
        non_neighbours = np.array(list(set(np.arange(0, size)) - set(contam_vector) - set(neighbours)))
        if len(non_neighbours) > 0:
            contam_vector = np.unique(
                 np.concatenate(
                    (
                        contam_vector,
                        np.extract(
                            np.random.rand(non_neighbours.shape[0]) < (1 - alpha) / non_neighbours.shape[0], non_neighbours
                        ),
                    )
                )
            )

        # Cure
        contam_vector = np.delete(
            contam_vector, np.where(np.random.rand(contam_vector.shape[0]) < cureRate)[0]
        )

        # Get the number of infected individuals
        res.append(float(len(contam_vector)))

    print("Results : ", list(enumerate(res)))
    return zip(*list(enumerate(res)))
Code Example #21
        dataFile = open(self.data_file)
        for line in dataFile:
            A = line.strip().split('\t')[0]
            B = line.strip().split('\t')[1]
            self.List.append(A)
            self.List.append(B)
        dataFile.close()

if __name__ == '__main__':

    PRdata = PRdata("data/WikiData.txt")
    dg = Graph.Graph()
    dg.add_nodes(PRdata.List)
    with open("data/WikiData.txt","r") as f:
         for line in f:
            A=line.strip().split('\t')[0]
            B=line.strip().split('\t')[1]
            dg.add_edge((A, B))
    f.close()
    #print(dg.node_n)
    pr = pagerank.pagerank(dg)
    page_ranks = pr.page_rank()

    print("The final page rank is saving...")
    with open("results/result.txt","w") as f_w:
        for key, value in page_ranks.items():
            #print(key + " : ", value[0])
            f_w.write(key + '\t\t' + str(value[0]) + '\n')
    f_w.close()
    print("The final page rank is okey.")
input('请按任意键退出...')
Code Example #22
File: test_pagerank.py Project: shaform/pagerank
                        nargs=1,
                        help='input file')
    args = parser.parse_args()

    print('executing with damping factor {}, epsilon {}'.format(
        args.d, args.e))

    # read the file

    print('constructing graph...')

    g = load_graph(args.ifile[0])

    print('{} nodes loaded.'.format(g.nNode))

    #

    print('start pageranking...')

    startTime = time.perf_counter()
    pg = pagerank.pagerank(g, args.d, args.e)
    endTime = time.perf_counter()

    print('Time spent: {}s'.format(endTime - startTime))

    #

    print('store result...')

    save_pagerank(pg, args.o)
Code Example #23
def textrank(sentences_tokenized, topn=5):
    matrix = build_sim_matrix(sentences_tokenized)
    ranks = pagerank(matrix)
    return matrix, ranks
Code Example #24
File: test_pagerank.py Project: shaform/pagerank
    parser.add_argument('ifile', metavar='input-file', nargs=1,
            help='input file')
    args = parser.parse_args()

    print('executing with damping factor {}, epsilon {}'.format(
        args.d, args.e))

    # read the file

    print('constructing graph...')

    g = load_graph(args.ifile[0])

    print('{} nodes loaded.'.format(g.nNode))

    #

    print('start pageranking...')

    startTime = time.perf_counter()
    pg = pagerank.pagerank(g, args.d, args.e)
    endTime = time.perf_counter()

    print('Time spent: {}s'.format(endTime-startTime))

    #

    print('store result...')

    save_pagerank(pg, args.o)
Code Example #25
File: main.py Project: Balding-Lee/pagerank
import graph_generate as gg
import matrix_generate as mg
import pagerank as pr

if __name__ == '__main__':
    dg = gg.create_network()
    adj_matrix = mg.create_matrix(dg)
    pr_vec = mg.create_pr_vector(dg)
    pr.pagerank(adj_matrix, pr_vec)
Code Example #26
        lambda x: (x['page_id'], x['title'], x['url'], x['content']))

    (documents_rdd.distinct().map(lambda (p, t, u, c): {
        "page_id": p,
        "title": t,
        "url": u,
        "content": c
    }).map(lambda x: json.dumps(x)).saveAsTextFile(DOCUMENTS_SAVE_PATH))

    print 'start get pagerank'
    pageID_rdd = barrel_rdd.map(lambda x: x['page_id'])
    title_to_pageID = (barrel_rdd.map(
        lambda x: (x['title'], x['page_id'])).distinct().collectAsMap())
    links_rdd = (load_local.pagelinksToDataframe(sc).rdd.map(
        lambda x: (x.pl_from, title_to_pageID.get(x.pl_title, -1))))
    ranks_rdd = pagerank.pagerank(pageID_rdd, links_rdd, 5)

    print 'start get tf'
    tf_rdd = (barrel_rdd.flatMap(lambda x: x['words_with_meta']).map(
        lambda (word, (page_id, is_title, tf)): (word_to_wordID[word],
                                                 (page_id, tf))).cache())

    print 'start get df'
    df_rdd = (barrel_rdd.flatMap(lambda x: x['unique_words']).map(
        lambda word: (word_to_wordID[word], 1)).reduceByKey(add).cache())

    print 'start get tfidf'
    tfidf_rdd = (
        tf_rdd.join(df_rdd).map(lambda (word_id, ((page_id, tf), df)): (
            (word_id, page_id), round(tf / float(df), 5))).cache())
Code Example #27
        all_realted_paper_li.append(row[0])
print "all_realted_paper_li create over"
print "len(all_realted_paper_li):", len(all_realted_paper_li)

# Read the author's papers from the first five years
paper5_set = set()
with open(r'..\result\paper5.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row[0] in all_realted_paper_li:
            paper5_set.add(row[0])
print "paper5_set and paper5_li create over"
print "len(paper5_set):", len(paper5_set)

with open(r'..\g_result\all_information_2.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row[0] in paper5_set:
            graph.add_node(row[0])
            cite_paper_li = row[7][2:-2].split('\', \'')
            for cite_paper in cite_paper_li:
                if cite_paper in all_realted_paper_li:
                    graph.add_edge(row[0], cite_paper)
print "graph create over"
print graph.number_of_nodes()
print graph.number_of_edges()

rank_value = pagerank.pagerank(graph, max_iteration=1000)
pagerank.save_pagerank_value(rank_value,
                             r'..\result\paper_unweighted_pagerank.csv')
Code Example #28
 def pagerank(self):
     return pagerank(self.tomatrix())
Code Example #29
    parsed_page_rdd = sc.textFile(load.PARSED_PAGE_PATH).map(util.encode)
    parsed_page_rdd.cache()
    docid_words = parsed_page_rdd.keys() \
                                  .map(lambda x: (x['url'], x['words'])) \
                                  .join(url_docid) \
                                  .map(lambda (url, (words, doc_id)): (doc_id, words))
    links_rdd = parsed_page_rdd.values().flatMap(lambda x: x)

    word_word_id = sc.textFile(load.WORD_LIST_PATH).map(
        util.encode).map(lambda x: (x['word'], x['word_id']))

    print 'start links'

    print 'start get pagerank'

    ranks_rdd = pagerank.pagerank(doc_id_rdd, links_rdd, 5)

    # Create tf idf and inverted index
    MINIMUM_VALID_DF = 10
    words_with_meta = docid_words.join(ranks_rdd) \
                                 .flatMap(lambda (doc_id, (words, rank)): [(word, (doc_id, meta, rank)) for (word, meta) in words]) \
                                 .join(word_word_id) \
                                 .map(lambda (word, ((doc_id, meta, rank), word_id)): ((word_id, word), (doc_id, meta, rank))) \
                                 .groupByKey() \
                                 .map(lambda (word_with_id, docs): (word_with_id, docs, len(docs))) \
                                 .filter(lambda (word_with_id, docs, df): df > MINIMUM_VALID_DF)
    words_with_meta.cache()

    words_with_meta.map(lambda ((word_id, word), docs, df): (word_id, [(doc_id, h, s, t, round(tf * df, 5), rank) for (doc_id, (h, s, t, tf), rank) in docs])) \
                   .map(lambda (word_id, docs): (word_id, [{"doc_id": doc_id, "tfidf": tfidf, "header": h, "style": s, "title": t, "pagerank": r} for (doc_id, h, s, t, tfidf, r) in docs])) \
                   .flatMap(lambda (word_id, docs): [(word_id, i, sub_docs) for i, sub_docs in enumerate(list(chunked(docs, int(math.ceil(len(docs)/float(DIVIDE_INDEX))))))]) \
Code Example #30
File: binary.py Project: Sulemanovaaa/info_search
import ast
import csv
import operator

from lematization import lemm_str
from pagerank import pagerank

def search(query: str, index_csv_path: str) -> [str]:
    lemms = lemm_str(query)
    doc_lists = []

    with open(index_csv_path) as f:
        reader = csv.DictReader(f, delimiter=',')
        for line in reader:
            for word in lemms:
                doc_lists.append(ast.literal_eval(line.get(word, '[]')))

    result = set()
    if len(doc_lists) > 0:
        result.update(set(doc_lists[0]))

    for docs in doc_lists:
        result.intersection_update(docs)
    return list(result)

if __name__ == "__main__":
    query = input("Enter your request:")
    answers = search(query, "./inverted_index.csv")
    for item in pagerank(answers):
        print(item)


Code Example #31
all_paper_list = list()
with open(r'..\result\paper5.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        all_paper_list.append(row[0])


graph = nx.DiGraph()
with open(r'..\g_result\all_information_2.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row[0] in all_paper_list:
            if len(row) == 8:
                cite_paper_str = row[7]
                if cite_paper_str == '[]':
                    graph.add_node(row[0])
                else:
                    cite_paper_li = cite_paper_str[2: -2].split('\', \'')
                    for cite_paper in cite_paper_li:
                        graph.add_edge(cite_paper, row[0])
            else:
                graph.add_node(row[0])
print "graph create over"


rank_value_dic = pagerank.pagerank(graph, 1000)

file_path = r'..\result\paper_pagerank_value.csv'
pagerank.save_pagerank_value(rank_value_dic, file_path)
Code Example #32
from graph import Node, Graph
from pagerank import pagerank

a = Node()
b = Node()
c = Node()
d = Node()

g = Graph()

g.add_node('a', a)
g.add_node('b', b)
g.add_node('c', c)
g.add_node('d', d)

g.add_edge('b', 'c')
g.add_edge('b', 'a')
g.add_edge('c', 'a')
g.add_edge('d', 'a')
g.add_edge('d', 'b')
g.add_edge('d', 'c')

ranks = pagerank(g)

for node, value in ranks.items():
    print(node.name, value)
Code Example #33
        for line in f:
            line = line.strip().split(" ")
            n_0 = int(line[0])
            path_count = int(line[-1])
            start_nodes[n_0] += path_count
            total_number_of_items_manufactured += path_count

    start_nodes = {k:v/total_number_of_items_manufactured for k,v in start_nodes.items()}


    # CALCULATE PAGERANK FOR DIFFERENT CONDITIONS
    # ===========================================
    # Case 1: Original algorithm
    E = {k:1 for k,v in edges.items()}
    start_nodes_all = {n:1/len(G.nodes) for n in G.nodes}
    PR_1 = pagerank.pagerank(E, list(G.nodes), start_nodes_all, B=0.85)

    # Case 2: Edge weights considered
    PR_2 = pagerank.pagerank(edges, list(G.nodes), start_nodes_all, B=0.85)

    # Case 3: Edge weights and start nodes considered
    PR_3 = pagerank.pagerank(edges, list(G.nodes), start_nodes, B=0.85)

    # Print table for LaTeX
    for k,v in PR_1.items():
        print(k, " & ", round(v,3), " & ", round(PR_2[k],3), " & ", round(PR_3[k],3), "\\\\")


    # PLOT RESULTS
    # ============
    x = list(PR_1.keys())
Code Example #34
sys.path.append('data/movie-actor/')

from make_casting_graph import oneway_to_bidirected_graph
g = oneway_to_bidirected_graph(graph)

from pagerank import pagerank, jypagerank

print()
print(
    '--------------------------- TOP RESULT by PAGERANK ALGORITHM ---------------------------'
)
print()
start = time.time()
rank = pagerank(g,
                bias=None,
                df=0.15,
                max_iter=30,
                converge_error=0.001,
                verbose=1)

# top rank movie
# filtering Korean movie
movie_rank = {node: rank for node, rank in rank.items() if node[0] == 'm'}
actor_rank = {node: rank for node, rank in rank.items() if node[0] == 'a'}

korean_movies = {
    movie: weight
    for movie, weight in movie_rank.items()
    if '한국)' in idx2movie(movie.split()[1])
}
list1 = []
Code Example #35
File: test.py Project: xylophone234/pagerank
import numpy as np
size = 100
h = np.arange(size * size).reshape(size, size).astype(np.float)
# normalize so every column sums to 1.0
h = h / h.sum(axis=0)
# save to 3 files
np.save('h_0_30.npy', h[0:30])
np.save('h_30_60.npy', h[30:60])
np.save('h_60_100.npy', h[60:100])

import pagerank
x = pagerank.pagerank('fn_chunk.txt', size)
print x
Code Example #36
    sys.exit(2)

for o, a in opts:
    if o == '-i':  # input directory
        input_directory = a
    elif o == '-d':  # dictionary file
        output_file_dictionary = a
    elif o == '-p':  # postings file
        output_file_postings = a
    else:
        assert False, "unhandled option"

if input_directory is None or output_file_postings is None or output_file_dictionary is None:
    usage()
    sys.exit(2)

G, url_map, doc_id_map = crawler(input_directory)
to_remove = list()
for node in G.nodes():
    keep = False
    for doc_id, url_nb in doc_id_map.items():
        if node == url_nb:
            keep = True
    if not keep:
        to_remove.append(node)
G.remove_nodes_from(to_remove)

pr_result = pagerank(G)
#print(pr_result)
build_index(input_directory, output_file_dictionary, output_file_postings)
コード例 #37
0
import networkx as nx
import pagerank

all_paper_list = list()
with open(r'..\result\paper5.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        all_paper_list.append(row[0])

graph = nx.DiGraph()
with open(r'..\g_result\all_information_2.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row[0] in all_paper_list:
            if len(row) == 8:
                cite_paper_str = row[7]
                if cite_paper_str == '[]':
                    graph.add_node(row[0])
                else:
                    cite_paper_li = cite_paper_str[2:-2].split('\', \'')
                    for cite_paper in cite_paper_li:
                        graph.add_edge(cite_paper, row[0])
            else:
                graph.add_node(row[0])
print "graph create over"

rank_value_dic = pagerank.pagerank(graph, 1000)

file_path = r'..\result\paper_pagerank_value.csv'
pagerank.save_pagerank_value(rank_value_dic, file_path)