コード例 #1
0
def load_neo4j_rdd(sc,
                   neo4j_count_query,
                   neo4j_main_query,
                   offset=0,
                   limit=1500):
    neo4j_rdd = sc.parallelize([], 4)

    total_count = neo4j_con.run(neo4j_count_query).evaluate() or 0
    pages = total_count / limit + (1 if total_count % limit else 0)
    print "Stats - Total record(s): {total_count} | Limit: {limit} | Total pages/pass:{pages}".format(
        total_count=total_count, limit=limit, pages=pages)
    start_time = time.asctime()
    print "Starting @ ", start_time
    for i in xrange(0, pages):
        params = {"offset": offset, "limit": limit}
        cur = neo4j_con.run(neo4j_main_query, params)
        nodes = [{k: v
                  for k, v in node.values()[0].iteritems()} for node in cur]

        #time.sleep(2)
        offset += limit
        # from py2neo.types import Node
        # Node().iteritems()
        print "Loaded {} nodes to RDD @ {}".format(offset + limit,
                                                   time.asctime())
        neo4j_rdd = neo4j_rdd.union(sc.parallelize(nodes))
    print "Finished at ", time.asctime()
    print "Processed {} records and loaded in RDD. Started @ {} Finished @ {}"\
        .format(total_count, start_time, time.asctime())
    return neo4j_rdd
コード例 #2
0
def create_indices():
    """Run the CREATE INDEX statements for Product, Category and Customer.

    Each statement is sent through the module-level ``neo4j_con`` (defined
    outside this block), one call per index, in the order listed below.
    """
    index_statements = (
        "CREATE INDEX ON :Product(product_id)",
        "CREATE INDEX ON :Product(asin)",
        "CREATE INDEX ON :Category(id)",
        "CREATE INDEX ON :Customer(customer_id)",
        "CREATE INDEX ON :Customer(num_id)",
    )

    for statement in index_statements:
        neo4j_con.run(statement)
コード例 #3
0
def test():
    cur = neo4j_con.run("MATCH (p:Product)-[r:REVIEWED_BY]->(c:Customer) RETURN c")
    nodes = []
    count = 0
    for node_tups in cur:
        count += 1
        nodes_list = []
        for node_tup in node_tups.values():
            print node_tup
            if type(node_tup) == dict:
                nodes_dict = {k: v for k, v in node_tup.iteritems()}
                nodes_list.append(nodes_dict)
            else:
                nodes_list.append(node_tup)
        nodes.append(deepcopy(nodes_list))

    print json.dumps(nodes)
    print count
コード例 #4
0
def load_neo4j_rdd(sc, neo4j_main_query, offset=0, limit=1500):
    neo4j_rdd = sc.parallelize([], 4)

    # total_count = neo4j_con.run(neo4j_count_query).evaluate() or 0
    # pages = total_count/limit + (1 if total_count%limit else 0)
    # print "Stats - Total record(s): {total_count} | Limit: {limit} | Total pages/pass:{pages}".format(
    #     total_count=total_count, limit=limit, pages=pages)
    start_time = time.asctime()
    print "Starting @ ", start_time
    #for i in xrange(0, pages):
    while True:
        params = {"offset": offset, "limit": limit}
        cur = neo4j_con.run(neo4j_main_query, params)
        #nodes = [{ k: v for k, v in node.values()[0].iteritems()} for node in cur]
        nodes = []
        empty = True
        for node_tups in cur:
            empty = False
            nodes_list = []
            for node_tup in node_tups.values():
                if type(node_tup) == dict:
                    nodes_dict = {k: v for k, v in node_tup.iteritems()}
                    nodes_list.append(nodes_dict)
                else:
                    nodes_list.append(node_tup)
            nodes.append(deepcopy(nodes_list))

        offset += limit
        if empty:
            break
        # from py2neo.types import Node
        # Node().iteritems()
        print "Loaded {} nodes to RDD @ {}".format(offset+limit, time.asctime())
        neo4j_rdd = neo4j_rdd.union(sc.parallelize(nodes))

    print "Finished at ", time.asctime()
    print "Loaded records in RDD. Started @ {} Finished @ {}"\
        .format(start_time, time.asctime())
    return neo4j_rdd