Example #1
 def run_pagerank(self, damping_factor=0.85, iterations=10):
     return pagerank(self.index_file,
                     self.edge_file,
                     self.max_node,
                     self.edge_count,
                     damping_factor=damping_factor,
                     iterations=iterations)
Example #2
 def test_pagerank(self):
     size = 1000
     g = nx.DiGraph(nx.powerlaw_cluster_graph(size, 3, 0.001))
     N = len(g.nodes())
     tmp_file = tempfile.NamedTemporaryFile(delete=False)
     for node in g.nodes():
         outlinks = g.out_edges(nbunch=[node])
         outlinks = map(str, [n2 for n1, n2 in outlinks])
         if not outlinks:
             value = 'pr_results,%s,%s' % (1.0/N, N)
             tmp_file.write('%s\t%s\n' % (node, value))
         else:
             outlinks_str = ','.join(outlinks)
             value = 'pr_results,%s,%s,' % (1.0/N, N)
             value += outlinks_str
             tmp_file.write('%s\t%s\n' % (node, value))
     tmp_file.flush()
     input_path = tmp_file.name
     job_id = 'unittest'
     sorted_ids = pagerank(job_id, self.iter_count, input_path, self.top_n)
     fs = HadoopFS()
     fs.rmr('%s/hat_results' % job_id)
     if self.top_n <= size: 
         self.assertEqual(len(sorted_ids), self.top_n, 'some ids are missing')
     id_ranges = range(0, size)
     for _id in sorted_ids:
         self.assertIn(int(_id), id_ranges, 'node should be in the graph')
Example #3
def netrank_gridsearch(network_path, diff_expr, out_path, alpha_prec=11):
    """Perform grid search over alpha parameter.

    This function will compute the netranks for a given network and differential
    expression data for a range of alpha parameters.

    Parameters:
    ----------
    network_path:           Path to the network in hdf5 container (with gene names).
                            Gene names and expression data are assumed to be in
                            the same order as the nodes in the adjacency matrix.

    diff_expr:              Differential expression dataframe. If set to None,
                            PageRank is calculated, otherwise NetRank will be used.

    out_path:               Directory to which the results are written.

    alpha_prec:             The number of runs to compute netrank scores.
                            Default is 10, which corresponds to computing ranks
                            for alpha = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1].
    """
    alpha_range = np.linspace(0, 1, alpha_prec)
    print("Running grid search for alpha={}".format(alpha_range))
    for alpha in alpha_range:
        scores, gene_names = pagerank.pagerank(network_path, diff_expr, alpha)
        out = os.path.join(out_path, 'netrank_alpha_{}.txt'.format(alpha))
        pagerank.write_ranking(scores, gene_names, out)
        print("Netrank for alpha {} computed successfully!".format(alpha))
    print("Grid Search successfully computed (results in {})".format(out_path))
Example #4
    def test_simple(self):
        graph_dict = {"A": {},
                      "B": {"A": {}, "C": {}},
                      "C": {"A": {}},
                      "D": {"A": {}, "B": {}, "C": {}}}
        graph = Graph(graph_dict)
        ranks = pagerank(graph, iterations=10)
        self.assertTrue(math.isclose(ranks["A"], 0.4583333, rel_tol=1e-6))
Example #5
def main():

    sentences = ["i am paradox", "my name is paradox", "i love caffeine"]
    sentences_tokenized = list(map(lambda x: x.split(), sentences))
    print(sentences_tokenized)
    matrix = build_sim_matrix(sentences_tokenized)
    print(matrix)
    result = pagerank(matrix)
    print(result)
Example #6
def undirected_page_rank(q, D, p, sim, th, priors):
    results = {}
    doc_ids = q['visited_documents']
    sim_graph = build_graph(cl.get_subset(D, doc_ids), sim, th)

    if priors == 'baseline':          # baseline: priors from the 1st-delivery ranking
        priors_vec = {doc_id: 1 / (50 + rank) for doc_id, rank in q['visited_documents_orders'].items()}
    elif priors == 'classification':  # classification: priors from the 2nd-delivery probabilities
        priors_vec = q['document_probabilities']
    else:                             # fall back to uniform priors
        priors_vec = None

    pr_values = {'vanilla_pk': pk.pagerank(sim_graph, max_iter=50, weight=None),
                 'extended_pk': pk.pagerank(sim_graph, max_iter=50, weight='weight',
                                            personalization=priors_vec)}

    for pr_type, values in pr_values.items():
        top_doc_ids = sorted(values, key=values.get, reverse=True)[:p]
        results[pr_type] = cl.get_subset(values, top_doc_ids)
    return results
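The example above leans on networkx's personalization argument to bias the walk with document priors. A toy illustration of the same idea on a made-up three-document graph (names, weights, and prior values are invented for demonstration):

import networkx as nx

g = nx.Graph()
g.add_weighted_edges_from([("d1", "d2", 0.9), ("d2", "d3", 0.4), ("d1", "d3", 0.2)])

# Vanilla PageRank ignores edge weights and uses a uniform teleport vector.
vanilla = nx.pagerank(g, max_iter=50, weight=None)

# The personalization dict biases the teleport step towards trusted documents.
priors = {"d1": 0.7, "d2": 0.2, "d3": 0.1}
personalized = nx.pagerank(g, max_iter=50, weight="weight", personalization=priors)

print(vanilla)
print(personalized)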
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('damping', nargs='?', help='The damping factor to use.', type=float, default=0.85)
    args = parser.parse_args()
    graph = endograph.endograph(sys.stdin.read())
    scores = zip(sorted(graph.nations), pagerank.pagerank(graph.matrix(), args.damping))
    print("\n".join(
        "%.10f %s" % (b,a) for (a,b) in 
        sorted(scores, key=lambda z:z[1], reverse=True)
        ))
Example #8
    def test_with_iter_and_dampening(self):
        graph_dict = {"A": {},
                      "B": {"A": {}, "C": {}},
                      "C": {"A": {}},
                      "D": {"A": {}, "B": {}, "C": {}}}
        graph = Graph(graph_dict)
        ranks = pagerank(graph, iterations=0, d=1)
        self.assertTrue(math.isclose(ranks["A"], 0.50747, rel_tol=1e-4))
        self.assertTrue(math.isclose(ranks["B"], 0.27431, rel_tol=1e-4))
        self.assertTrue(math.isclose(ranks["C"], 0.1925, rel_tol=1e-4))
        self.assertTrue(math.isclose(ranks["D"], 0.15, rel_tol=1e-4))
Example #9
def pagerank_aggregator(objects, threshold, alpha):
    """Implements PageRank aggregation for a given alpha and epsilon.

    Alpha controls the bias towards the (non-random, in this case) surf
    probability; epsilon is the convergence threshold, a small number in
    practice.

    The function first constructs the graph from the rankers and then calls
    the pagerank function repeatedly until the average change in PageRank
    scores falls below epsilon. It then converts the resulting scores into a
    ranking.
    """

    ### Construct graph version of the rankers and update indegrees
    indegrees = {}
    total_indegrees = 0.0
    graph = {}
    for key in objects.keys():
        graph[key] = []
        indegrees[key] = 0.0

    for key1 in objects.keys():
        for key2 in objects.keys():
            if key1 != key2:
                count = num_higher(objects, key1, key2) 
                if count > 0:
                    graph[key1].append( (key2,float(count)) )
                    indegrees[key2] += count
                    total_indegrees += count


    ## Normalize each node's outgoing weights so they sum to 1
    for key1 in graph.keys():
        total = 0
        for (key2,val) in graph[key1]:
            total += val
        for i in range(len(graph[key1])):
            (key2, val) = graph[key1][i]
            graph[key1][i] = (key2, val/total)

    ##Normalize indegrees as well
    for key in indegrees.keys():
        indegrees[key] /= total_indegrees


    ### Call page rank
    final_scores = pg.pagerank(graph, indegrees, threshold, alpha)

    #final_scores = pg.pagerank(graph, {}, threshold, alpha)

    ### Convert the final scores to a ranking
    ranker = get_ranker_for_scores(final_scores)

    rankscore = kendall_tau(objects, ranker)
    return ranker, rankscore
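The aggregator above delegates the actual iteration to pg.pagerank. Below is a minimal sketch of a compatible routine, assuming the data shapes built above (an adjacency list of (node, normalized_weight) pairs, a normalized prior per node, a convergence threshold, and alpha); it illustrates the standard personalized power iteration and is not the original implementation.

def pagerank_sketch(graph, priors, threshold, alpha):
    """Illustrative power iteration.

    graph:   {node: [(neighbour, normalized_weight), ...]}
    priors:  {node: teleport probability}; an empty dict means uniform.
    """
    nodes = list(graph.keys())
    n = len(nodes)
    teleport = priors if priors else {node: 1.0 / n for node in nodes}
    scores = {node: 1.0 / n for node in nodes}

    while True:
        new_scores = {node: (1.0 - alpha) * teleport.get(node, 0.0) for node in nodes}
        for node, outlinks in graph.items():
            # Dangling nodes (no outlinks) simply leak their mass here;
            # a full implementation would redistribute it.
            for neighbour, weight in outlinks:
                new_scores[neighbour] += alpha * scores[node] * weight
        # Stop once the average absolute change falls below the threshold.
        avg_change = sum(abs(new_scores[node] - scores[node]) for node in nodes) / n
        scores = new_scores
        if avg_change < threshold:
            return scores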
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('damping',
                        nargs='?',
                        help='The damping factor to use.',
                        type=float,
                        default=0.85)
    args = parser.parse_args()
    graph = endograph.endograph(sys.stdin.read())
    scores = zip(sorted(graph.nations),
                 pagerank.pagerank(graph.matrix(), args.damping))
    print("\n".join(
        "%.10f %s" % (b, a)
        for (a, b) in sorted(scores, key=lambda z: z[1], reverse=True)))
Example #11
def pagerank_simulation_test(input_path):
    iter_count = 5
    top_n = 500
    job_id = 1
    sorted_ids = pagerank(job_id, iter_count, input_path, top_n)
    if sorted_ids:
        # print at most the first 10 ids
        for _id in sorted_ids[:10]:
            print _id
    fs = HadoopFS()
    fs.rmr('%s/hat_results' % job_id)
Example #12
def find_key_phrases(tokens, parts_of_speech, window):

    tagged_tokens = nltk.pos_tag(tokens)

    nodes = []
    for token in tokens:
        nodes.append((token, 1))

    edges = []
    for i in range(len(tagged_tokens)):
        if tagged_tokens[i][1] in parts_of_speech:
            right = min(i + window, len(tagged_tokens))
            for j in range(i + 1, right):
                edge = (tokens[i], tokens[j], {"weight": 1})
                edges.append(edge)
    return pagerank(nodes, edges, 15)
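A short, hypothetical driver for find_key_phrases; the token list, POS filter, and window size are illustrative, and it assumes the pagerank helper used by the snippet is importable and nltk's tagger data is installed (e.g. via nltk.download('averaged_perceptron_tagger')).

import nltk

tokens = "graph based ranking algorithms score words by importance".split()
parts_of_speech = {"NN", "NNS", "JJ"}   # keep nouns and adjectives as candidates
window = 3

key_phrases = find_key_phrases(tokens, parts_of_speech, window)
print(key_phrases)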
Example #13
def pagerank_rank(top_n, date, topic_id, window_size):
    data = []

    tmp_file = prepare_data_for_pr(topic_id, date, window_size)

    if not tmp_file:
        return data

    input_tmp_path = tmp_file.name
    
    
    job_id = generate_job_id(datetime2ts(date), window_size, topic_id)
    iter_count = PAGERANK_ITER_MAX

    sorted_uids = pagerank(job_id, iter_count, input_tmp_path, top_n)

    print sorted_uids
Example #14
 def pagerankCenter(graph):
     if len(graph.nodes) == 1:
         return graph.nodes[0], 0
     simMatrix = pagerank(graph, graph.nodes)
     sumDist = dict()
     minDist = float("inf")
     centerList = list()
     center = None
     for node in graph.nodes:
         sumDist[node] = sum(list(simMatrix[node].values()))
         if minDist > sumDist[node]:
             centerList = [node]
             minDist = sumDist[node]
         elif minDist == sumDist[node]:
             centerList.append(node)
     center = centerList[0]
     radius = nx.eccentricity(graph, v=center)
     return center, radius
Example #15
def main():
    parser = argparse.ArgumentParser(description='Running PageRank')
    parser.add_argument('path_file',
                        metavar='f',
                        type=str,
                        help='path to file to perform page ranking')
    parser.add_argument('beta',
                        metavar='b',
                        type=float,
                        help='allow teleportation with 1-beta probability')
    parser.add_argument('eps',
                        metavar='eps',
                        type=float,
                        help='epsilon value for convergence')
    parser.add_argument('n_nodes',
                        metavar='nodes',
                        type=int,
                        help='number of unique nodes')
    parser.add_argument('n_edges',
                        metavar='edges',
                        type=int,
                        help='number of all edges')
    parser.add_argument('pow_iter',
                        metavar='pow_iter',
                        type=int,
                        help='number of iterations')

    args = parser.parse_args()
    file_path = args.path_file
    beta = args.beta
    eps = args.eps
    n_nodes = args.n_nodes
    n_edges = args.n_edges
    num_iterations = args.pow_iter

    result = pagerank(file_name=file_path,
                      beta=beta,
                      power_iterations=num_iterations,
                      num_edges=n_edges,
                      num_nodes=n_nodes,
                      eps=eps)
Example #16
def pagerank_rank(top_n, date, topic_id, window_size):
    data = []

    tmp_file = prepare_data_for_pr(topic_id, date, window_size)

    if not tmp_file:
        return data

    input_tmp_path = tmp_file.name
    
    job_id = generate_job_id(datetime2ts(date), window_size, topic_id)
    iter_count = PAGERANK_ITER_MAX

    sorted_uids = pagerank(job_id, iter_count, input_tmp_path, top_n)

    topicname = acquire_topic_name(topic_id)
    if not topicname:
        return data

    data = save_rank_results(sorted_uids, 'topic', 'pagerank', date, window_size, topicname)

    return data
Example #17
    for row in reader:
        all_realted_paper_li.append(row[0])
print "all_realted_paper_li create over"
print "len(all_realted_paper_li):", len(all_realted_paper_li)

# Read the author's papers from the first five years
paper5_set = set()
with open(r'..\result\paper5.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row[0] in all_realted_paper_li:
            paper5_set.add(row[0])
print "paper5_set and paper5_li create over"
print "len(paper5_set):", len(paper5_set)

with open(r'..\g_result\all_information_2.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row[0] in paper5_set:
            graph.add_node(row[0])
            cite_paper_li = row[7][2: -2].split('\', \'')
            for cite_paper in cite_paper_li:
                if cite_paper in all_realted_paper_li:
                    graph.add_edge(row[0], cite_paper)
print "graph create over"
print graph.number_of_nodes()
print graph.number_of_edges()

rank_value = pagerank.pagerank(graph, max_iteration=1000)
pagerank.save_pagerank_value(rank_value, r'..\result\paper_unweighted_pagerank.csv')
Example #18
def crawler():
    try:
        conn = sqlite3.connect(db_file)
        conn.text_factory = str
        pool = threadpool.ThreadPool(16)

        # Init keywords
        with open(word_file, "rb") as f:
            keywords = pickle.load(f)
        with open(searched_word_file, "rb") as f:
            searched = pickle.load(f)

        while True:
            keywords_list = list(keywords)
            while len(keywords_list) != 0:
                init_vals = []
                for k in keywords_list[:key_batch_size]:
                    for site in search_sites:
                        for search_page in site['search_page']:
                            init_vals.append(([
                                site['domain'], search_page, site['header'], k
                            ], None))

                threqs = threadpool.makeRequests(proc_search, init_vals)
                [pool.putRequest(req) for req in threqs]
                pool.wait()

                print("Commiting...")

                cursor = conn.cursor()

                for cmd in webpage_cmds:
                    try:
                        cursor.execute(cmd[0], cmd[1])
                    except:
                        pass

                for cmd in word_cmds:
                    try:
                        cursor.execute(cmd[0], cmd[1])
                    except:
                        pass

                for cmd in ref_cmds:
                    try:
                        cursor.execute(cmd[0], cmd[1])
                    except:
                        pass

                cursor.close()
                conn.commit()

                webpage_cmds.clear()
                ref_cmds.clear()
                word_cmds.clear()

                print("Ranking and indexing...")
                pagerank()
                index()

                print("Saving keywords...")

                for k in keywords_list[:key_batch_size]:
                    searched.add(k)

                keywords_list = keywords_list[key_batch_size:]

                with open(word_file, "wb") as f:
                    pickle.dump(set(keywords_list), f)

                with open(searched_word_file, "wb") as f:
                    pickle.dump(searched, f)

                print("Done.")

            keywords = new_key.copy()
            new_key.clear()

            print("Iterator Done.")

    except Exception as e:
        print(e)
    finally:
        conn.close()
Example #19
''' Run PageRank algorithm on the moon landing webgraph '''
from graph import adjacency_list
from pagerank import pagerank
from results import output_rank_json, output_rank_csv, output_random_walks
from randomwalks import random_walks

# Parse the dataset into an adjacency list
adj_list = adjacency_list("data/adj_list")
# Unscaled PageRank
rank = pagerank(adj_list, 10)

# output results of basic pagerank
output_rank_json("data/nodes", rank, "out/unscaledranking.json")
output_rank_csv("data/nodes", rank, "out/unscaledranking.csv")


# Scaled pagerank s = 0.85
scaled_rank = pagerank(adj_list, 10, scaled=True)

# output results of scaled pagerank
output_rank_json("data/nodes", scaled_rank, "out/scaledranking.json")
output_rank_csv("data/nodes", scaled_rank, "out/scaledranking.csv")

# Scaled pagerank s = 0.7
scaled_rank = pagerank(adj_list, 10, scaled=True, s=0.7)
output_rank_csv("data/nodes", scaled_rank, "out/scaledranking7.csv")

# Scaled pagerank s = 0.5
scaled_rank = pagerank(adj_list, 10, scaled=True, s=0.5)
output_rank_csv("data/nodes", scaled_rank, "out/scaledranking5.csv")
Example #20
def epidemic_propagation(
        adjencyMatrix,
        scenario="noVaccination",
        initContamination=0.05,
        initVaccination=0.12,
        contaminationRate=0.2,
        cureRate=0.26,
        alpha=0.85,
        epsilon=0.00001):
    """
    Computes the propagation of an epidemy

    Args:
        adjencyMatrix: The adjency matrix of the graph
        scenario: The scenario to execute (noVaccination, randomVaccination, specificVaccination)
        initContamination: X% of the initially infected people
        initVaccination: Y% of the initially vaccinated people
        contaminationRate: % of chance to contamine a neighbour
        cureRate: Self cure rate
        alpha: Dumping factor

    Returns:

    """
    size = adjencyMatrix.shape[0]

    # Vaccinate people
    if scenario == "PRVaccination":  # PageRank vaccination (Most important nodes are vaccinate)
        vaccinated_nodes = pagerank(adjencyMatrix, alpha, epsilon)[: int(size * initVaccination)]
    elif scenario == "randomVaccination":  # Random vaccination
        vaccinated_nodes = np.random.choice(size, int(size * initVaccination), replace=False)
    else:  # No vaccination
        vaccinated_nodes = []

    adjencyMatrix = np.delete(adjencyMatrix, vaccinated_nodes, 0)
    adjencyMatrix = np.delete(adjencyMatrix, vaccinated_nodes, 1)

    # Select randomly nodes that are initially infected
    size = adjencyMatrix.shape[0]
    contam_vector = np.random.choice(size, int(size * initContamination))

    # Infection ratio is the parameter given
    res = [contaminationRate]

    # Iterations
    for i in range(200):
        # Get the neighbours of infected nodes
        neighbours = adjencyMatrix[contam_vector, :].nonzero()[1]
        # Infect neighbour (contaminationRate probability)
        contam_vector = np.unique(
            np.concatenate(
                (
                    contam_vector,
                    np.extract(np.random.rand(neighbours.shape[0]) < contaminationRate, neighbours),
                )
            )
        )

        # Infect non-neighbour (1-alpha probability)
        non_neighbours = np.array(list(set(np.arange(0, size)) - set(contam_vector) - set(neighbours)))
        if len(non_neighbours) > 0:
            contam_vector = np.unique(
                 np.concatenate(
                    (
                        contam_vector,
                        np.extract(
                            np.random.rand(non_neighbours.shape[0]) < (1 - alpha) / non_neighbours.shape[0], non_neighbours
                        ),
                    )
                )
            )

        # Cure
        contam_vector = np.delete(
            contam_vector, np.where(np.random.rand(contam_vector.shape[0]) < cureRate)[0]
        )

        # Get the number of infected individuals
        res.append(float(len(contam_vector)))

    print("Results : ", list(enumerate(res)))
    return zip(*list(enumerate(res)))
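A possible driver for epidemic_propagation on a synthetic graph; the random adjacency matrix and parameters below are assumptions for illustration only (the randomVaccination scenario is used so that no external pagerank function is required).

import numpy as np

rng = np.random.default_rng(0)
n = 500
adj = (rng.random((n, n)) < 0.01).astype(float)
adj = np.triu(adj, 1)
adj = adj + adj.T                    # symmetric adjacency matrix, zero diagonal

iterations, infected = epidemic_propagation(adj, scenario="randomVaccination")
print(list(zip(iterations, infected))[:10])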
Example #21
        dataFile = open(self.data_file)
        for line in dataFile:
            A = line.strip().split('\t')[0]
            B = line.strip().split('\t')[1]
            self.List.append(A)
            self.List.append(B)
        dataFile.close()

if __name__ == '__main__':

    PRdata = PRdata("data/WikiData.txt")
    dg = Graph.Graph()
    dg.add_nodes(PRdata.List)
    with open("data/WikiData.txt","r") as f:
         for line in f:
            A=line.strip().split('\t')[0]
            B=line.strip().split('\t')[1]
            dg.add_edge((A, B))
    f.close()
    #print(dg.node_n)
    pr = pagerank.pagerank(dg)
    page_ranks = pr.page_rank()

    print("The final page rank is saving...")
    with open("results/result.txt","w") as f_w:
        for key, value in page_ranks.items():
            #print(key + " : ", value[0])
            f_w.write(key + '\t\t' + str(value[0]) + '\n')
    f_w.close()
    print("The final page rank is okey.")
input('Press any key to exit...')
Example #22
                        nargs=1,
                        help='input file')
    args = parser.parse_args()

    print('executing with damping factor {}, epsilon {}'.format(
        args.d, args.e))

    # read the file

    print('constructing graph...')

    g = load_graph(args.ifile[0])

    print('{} nodes loaded.'.format(g.nNode))

    #

    print('start pageranking...')

    startTime = time.perf_counter()
    pg = pagerank.pagerank(g, args.d, args.e)
    endTime = time.perf_counter()

    print('Time spent: {}s'.format(endTime - startTime))

    #

    print('store result...')

    save_pagerank(pg, args.o)
Example #23
def textrank(sentences_tokenized, topn=5):
    matrix = build_sim_matrix(sentences_tokenized)
    ranks = pagerank(matrix)
    return matrix, ranks
Example #24
    parser.add_argument('ifile', metavar='input-file', nargs=1,
            help='input file')
    args = parser.parse_args()

    print('executing with damping factor {}, epsilon {}'.format(
        args.d, args.e))

    # read the file

    print('constructing graph...')

    g = load_graph(args.ifile[0])

    print('{} nodes loaded.'.format(g.nNode))

    #

    print('start pageranking...')

    startTime = time.perf_counter()
    pg = pagerank.pagerank(g, args.d, args.e)
    endTime = time.perf_counter()

    print('Time spent: {}s'.format(endTime-startTime))

    #

    print('store result...')

    save_pagerank(pg, args.o)
Example #25
import graph_generate as gg
import matrix_generate as mg
import pagerank as pr

if __name__ == '__main__':
    dg = gg.create_network()
    adj_matrix = mg.create_matrix(dg)
    pr_vec = mg.create_pr_vector(dg)
    pr.pagerank(adj_matrix, pr_vec)
Example #26
        lambda x: (x['page_id'], x['title'], x['url'], x['content']))

    (documents_rdd.distinct().map(lambda (p, t, u, c): {
        "page_id": p,
        "title": t,
        "url": u,
        "content": c
    }).map(lambda x: json.dumps(x)).saveAsTextFile(DOCUMENTS_SAVE_PATH))

    print 'start get pagerank'
    pageID_rdd = barrel_rdd.map(lambda x: x['page_id'])
    title_to_pageID = (barrel_rdd.map(
        lambda x: (x['title'], x['page_id'])).distinct().collectAsMap())
    links_rdd = (load_local.pagelinksToDataframe(sc).rdd.map(
        lambda x: (x.pl_from, title_to_pageID.get(x.pl_title, -1))))
    ranks_rdd = pagerank.pagerank(pageID_rdd, links_rdd, 5)

    print 'start get tf'
    tf_rdd = (barrel_rdd.flatMap(lambda x: x['words_with_meta']).map(
        lambda (word, (page_id, is_title, tf)): (word_to_wordID[word],
                                                 (page_id, tf))).cache())

    print 'start get df'
    df_rdd = (barrel_rdd.flatMap(lambda x: x['unique_words']).map(
        lambda word: (word_to_wordID[word], 1)).reduceByKey(add).cache())

    print 'start get tfidf'
    tfidf_rdd = (
        tf_rdd.join(df_rdd).map(lambda (word_id, ((page_id, tf), df)): (
            (word_id, page_id), round(tf / float(df), 5))).cache())
Example #27
        all_realted_paper_li.append(row[0])
print "all_realted_paper_li create over"
print "len(all_realted_paper_li):", len(all_realted_paper_li)

# Read the author's papers from the first five years
paper5_set = set()
with open(r'..\result\paper5.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row[0] in all_realted_paper_li:
            paper5_set.add(row[0])
print "paper5_set and paper5_li create over"
print "len(paper5_set):", len(paper5_set)

with open(r'..\g_result\all_information_2.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row[0] in paper5_set:
            graph.add_node(row[0])
            cite_paper_li = row[7][2:-2].split('\', \'')
            for cite_paper in cite_paper_li:
                if cite_paper in all_realted_paper_li:
                    graph.add_edge(row[0], cite_paper)
print "graph create over"
print graph.number_of_nodes()
print graph.number_of_edges()

rank_value = pagerank.pagerank(graph, max_iteration=1000)
pagerank.save_pagerank_value(rank_value,
                             r'..\result\paper_unweighted_pagerank.csv')
Example #28
 def pagerank(self):
     return pagerank(self.tomatrix())
Example #29
    parsed_page_rdd = sc.textFile(load.PARSED_PAGE_PATH).map(util.encode)
    parsed_page_rdd.cache()
    docid_words = parsed_page_rdd.keys() \
                                  .map(lambda x: (x['url'], x['words'])) \
                                  .join(url_docid) \
                                  .map(lambda (url, (words, doc_id)): (doc_id, words))
    links_rdd = parsed_page_rdd.values().flatMap(lambda x: x)

    word_word_id = sc.textFile(load.WORD_LIST_PATH).map(
        util.encode).map(lambda x: (x['word'], x['word_id']))

    print 'start links'

    print 'start get pagerank'

    ranks_rdd = pagerank.pagerank(doc_id_rdd, links_rdd, 5)

    # Create tf idf and inverted index
    MINIMUM_VALID_DF = 10
    words_with_meta = docid_words.join(ranks_rdd) \
                                 .flatMap(lambda (doc_id, (words, rank)): [(word, (doc_id, meta, rank)) for (word, meta) in words]) \
                                 .join(word_word_id) \
                                 .map(lambda (word, ((doc_id, meta, rank), word_id)): ((word_id, word), (doc_id, meta, rank))) \
                                 .groupByKey() \
                                 .map(lambda (word_with_id, docs): (word_with_id, docs, len(docs))) \
                                 .filter(lambda (word_with_id, docs, df): df > MINIMUM_VALID_DF)
    words_with_meta.cache()

    words_with_meta.map(lambda ((word_id, word), docs, df): (word_id, [(doc_id, h, s, t, round(tf * df, 5), rank) for (doc_id, (h, s, t, tf), rank) in docs])) \
                   .map(lambda (word_id, docs): (word_id, [{"doc_id": doc_id, "tfidf": tfidf, "header": h, "style": s, "title": t, "pagerank": r} for (doc_id, h, s, t, tfidf, r) in docs])) \
                   .flatMap(lambda (word_id, docs): [(word_id, i, sub_docs) for i, sub_docs in enumerate(list(chunked(docs, int(math.ceil(len(docs)/float(DIVIDE_INDEX))))))]) \
Example #30
import ast
import csv
from lematization import lemm_str
from pagerank import pagerank
import operator

def search(query: str, index_csv_path: str) -> [str]:
    lemms = lemm_str(query)
    doc_lists = []

    with open(index_csv_path) as f:
        reader = csv.DictReader(f, delimiter=',')
        for line in reader:
            for word in lemms:
                doc_lists.append(ast.literal_eval(line.get(word, '[]')))

    result = set()
    if len(doc_lists) > 0:
        result.update(set(doc_lists[0]))

    for docs in doc_lists:
        result.intersection_update(docs)
    return list(result)

if __name__ == "__main__":
    query = input("Enter your request:")
    answers = search(query, "./inverted_index.csv")
    for item in pagerank(answers):
        print(item)


Example #31
all_paper_list = list()
with open(r'..\result\paper5.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        all_paper_list.append(row[0])


graph = nx.DiGraph()
with open(r'..\g_result\all_information_2.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row[0] in all_paper_list:
            if len(row) == 8:
                cite_paper_str = row[7]
                if cite_paper_str == '[]':
                    graph.add_node(row[0])
                else:
                    cite_paper_li = cite_paper_str[2: -2].split('\', \'')
                    for cite_paper in cite_paper_li:
                        graph.add_edge(cite_paper, row[0])
            else:
                graph.add_node(row[0])
print "graph create over"


rank_value_dic = pagerank.pagerank(graph, 1000)

file_path = r'..\result\paper_pagerank_value.csv'
pagerank.save_pagerank_value(rank_value_dic, file_path)
Example #32
0
from graph import Node, Graph
from pagerank import pagerank

a = Node()
b = Node()
c = Node()
d = Node()

g = Graph()

g.add_node('a', a)
g.add_node('b', b)
g.add_node('c', c)
g.add_node('d', d)

g.add_edge('b', 'c')
g.add_edge('b', 'a')
g.add_edge('c', 'a')
g.add_edge('d', 'a')
g.add_edge('d', 'b')
g.add_edge('d', 'c')

ranks = pagerank(g)

for node, value in ranks.items():
    print(node.name, value)
Example #33
        for line in f:
            line = line.strip().split(" ")
            n_0 = int(line[0])
            path_count = int(line[-1])
            start_nodes[n_0] += path_count
            total_number_of_items_manufactured += path_count

    start_nodes = {k:v/total_number_of_items_manufactured for k,v in start_nodes.items()}


    # CALCULATE PAGERANK FOR DIFFERENT CONDITIONS
    # ===========================================
    # Case 1: Original algorithm
    E = {k:1 for k,v in edges.items()}
    start_nodes_all = {n:1/len(G.nodes) for n in G.nodes}
    PR_1 = pagerank.pagerank(E, list(G.nodes), start_nodes_all, B=0.85)

    # Case 2: Edge weights considered
    PR_2 = pagerank.pagerank(edges, list(G.nodes), start_nodes_all, B=0.85)

    # Case 3: Edge weights and start nodes considered
    PR_3 = pagerank.pagerank(edges, list(G.nodes), start_nodes, B=0.85)

    # Print table for LaTeX
    for k,v in PR_1.items():
        print(k, " & ", round(v,3), " & ", round(PR_2[k],3), " & ", round(PR_3[k],3), "\\\\")


    # PLOT RESULTS
    # ============
    x = list(PR_1.keys())
Example #34
sys.path.append('data/movie-actor/')

from make_casting_graph import oneway_to_bidirected_graph
g = oneway_to_bidirected_graph(graph)

from pagerank import pagerank, jypagerank

print()
print(
    '--------------------------- TOP RESULT by PAGERANK ALGORITHM ---------------------------'
)
print()
start = time.time()
rank = pagerank(g,
                bias=None,
                df=0.15,
                max_iter=30,
                converge_error=0.001,
                verbose=1)

# top rank movie
# filtering Korean movie
movie_rank = {node: rank for node, rank in rank.items() if node[0] == 'm'}
actor_rank = {node: rank for node, rank in rank.items() if node[0] == 'a'}

korean_movies = {
    movie: weight
    for movie, weight in movie_rank.items()
    if '한국)' in idx2movie(movie.split()[1])
}
list1 = []
Example #35
import numpy as np
size = 100
h = np.arange(size * size).reshape(size, size).astype(np.float64)
# normalize so that every column sums to 1.0
h = h / h.sum(axis=0)
# save to 3 files
np.save('h_0_30.npy', h[0:30])
np.save('h_30_60.npy', h[30:60])
np.save('h_60_100.npy', h[60:100])

import pagerank
x = pagerank.pagerank('fn_chunk.txt', size)
print x
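For reference, the iteration that such a chunked setup typically performs is the plain power method x <- H x on the column-stochastic matrix built above; a small in-memory sketch follows (illustrative only, not the chunked pagerank module the example calls).

import numpy as np

def power_iteration(h, tol=1e-10, max_iter=1000):
    n = h.shape[0]
    x = np.full(n, 1.0 / n)
    for _ in range(max_iter):
        x_next = h @ x                     # columns of h sum to 1, so mass is preserved
        if np.abs(x_next - x).sum() < tol:
            return x_next
        x = x_next
    return x

size = 100
h = np.arange(size * size).reshape(size, size).astype(float)
h = h / h.sum(axis=0)
print(power_iteration(h)[:5])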
Example #36
    sys.exit(2)

for o, a in opts:
    if o == '-i':  # input directory
        input_directory = a
    elif o == '-d':  # dictionary file
        output_file_dictionary = a
    elif o == '-p':  # postings file
        output_file_postings = a
    else:
        assert False, "unhandled option"

if input_directory is None or output_file_postings is None or output_file_dictionary is None:
    usage()
    sys.exit(2)

G, url_map, doc_id_map = crawler(input_directory)
to_remove = list()
for node in G.nodes():
    keep = False
    for doc_id, url_nb in doc_id_map.items():
        if node == url_nb:
            keep = True
    if not keep:
        to_remove.append(node)
G.remove_nodes_from(to_remove)

pr_result = pagerank(G)
#print(pr_result)
build_index(input_directory, output_file_dictionary, output_file_postings)
Example #37
import csv
import networkx as nx
import pagerank

all_paper_list = list()
with open(r'..\result\paper5.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        all_paper_list.append(row[0])

graph = nx.DiGraph()
with open(r'..\g_result\all_information_2.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row[0] in all_paper_list:
            if len(row) == 8:
                cite_paper_str = row[7]
                if cite_paper_str == '[]':
                    graph.add_node(row[0])
                else:
                    cite_paper_li = cite_paper_str[2:-2].split('\', \'')
                    for cite_paper in cite_paper_li:
                        graph.add_edge(cite_paper, row[0])
            else:
                graph.add_node(row[0])
print "graph create over"

rank_value_dic = pagerank.pagerank(graph, 1000)

file_path = r'..\result\paper_pagerank_value.csv'
pagerank.save_pagerank_value(rank_value_dic, file_path)