def load_neo4j_rdd(sc, neo4j_count_query, neo4j_main_query, offset=0, limit=1500):
    """Page through a Neo4j query and return every fetched row as one RDD.

    The count query is run once on the global ``neo4j_con`` to decide how many
    pages to fetch.  The main query is then run once per page with the query
    parameters ``offset`` and ``limit``.  Only the first value of each returned
    record is kept, converted to a plain ``dict``.

    NOTE(review): this file later defines another ``load_neo4j_rdd`` that takes
    no count query; when both live in the same module the later definition
    replaces this one.

    :param sc: Spark context used to build the RDDs (``parallelize``/``union``).
    :param neo4j_count_query: query whose evaluated result is the total number
        of records; a falsy result is treated as 0.
    :param neo4j_main_query: query run for every page; it receives the
        ``offset`` and ``limit`` parameters.
    :param offset: value of ``offset`` for the first page.
    :param limit: page size; ``offset`` advances by this amount per page.
    :return: RDD holding one dict per fetched record.
    """
    total_count = neo4j_con.run(neo4j_count_query).evaluate() or 0
    # Ceiling division; ``//`` keeps the page count an integer.
    pages = total_count // limit + (1 if total_count % limit else 0)
    print("Stats - Total record(s): {total_count} | Limit: {limit} | Total pages/pass:{pages}".format(
        total_count=total_count, limit=limit, pages=pages))

    start_time = time.asctime()
    print("Starting @  {}".format(start_time))

    # Collect one RDD per page and union them once at the end: chaining
    # ``rdd.union`` inside the loop nests the lineage one level per page.
    page_rdds = [sc.parallelize([], 4)]
    loaded = 0
    for _ in range(pages):
        params = {"offset": offset, "limit": limit}
        cur = neo4j_con.run(neo4j_main_query, params)
        nodes = [dict(record.values()[0]) for record in cur]
        offset += limit
        # Report what was actually fetched so far; the previous
        # ``offset + limit`` was one full page ahead and ignored short pages.
        loaded += len(nodes)
        print("Loaded {} nodes to RDD @ {}".format(loaded, time.asctime()))
        page_rdds.append(sc.parallelize(nodes))
    neo4j_rdd = sc.union(page_rdds)

    print("Finished at  {}".format(time.asctime()))
    print("Processed {} records and loaded in RDD. Started @ {} Finished @ {}"
          .format(total_count, start_time, time.asctime()))
    return neo4j_rdd
def create_indices():
    """Run the CREATE INDEX statements for Product, Category and Customer.

    Each statement is sent to the global ``neo4j_con`` in the order listed
    below; an exception from one statement stops the remaining ones.
    """
    index_statements = (
        "CREATE INDEX ON :Product(product_id)",
        "CREATE INDEX ON :Product(asin)",
        "CREATE INDEX ON :Category(id)",
        "CREATE INDEX ON :Customer(customer_id)",
        "CREATE INDEX ON :Customer(num_id)",
    )
    for statement in index_statements:
        neo4j_con.run(statement)
def test():
    """Debug helper: dump the ``c`` values of the REVIEWED_BY query.

    Runs a fixed Product-[REVIEWED_BY]->Customer query on the global
    ``neo4j_con``, prints every returned value as it is read, then prints the
    collected rows as JSON followed by the number of records.
    """
    cursor = neo4j_con.run("MATCH (p:Product)-[r:REVIEWED_BY]->(c:Customer) RETURN c")
    rows = []
    for record in cursor:
        row = []
        for value in record.values():
            print(value)
            # Exact-type check: only plain dicts are re-built; instances of
            # dict subclasses are appended unchanged.
            row.append(dict(value) if type(value) is dict else value)
        # Deep-copy so the stored row shares nothing with the cursor's record.
        rows.append(deepcopy(row))
    print(json.dumps(rows))
    # One row is stored per record, so this is the record count.
    print(len(rows))
def load_neo4j_rdd(sc, neo4j_main_query, offset=0, limit=1500):
    """Page through a Neo4j query until it returns nothing; return one RDD.

    The main query is run repeatedly on the global ``neo4j_con`` with the
    query parameters ``offset`` and ``limit``; ``offset`` grows by ``limit``
    after every page and the loop stops at the first page with no records.
    Each record becomes a list holding a deep copy of its values.

    :param sc: Spark context used to build the RDDs (``parallelize``/``union``).
    :param neo4j_main_query: query run for every page; it receives the
        ``offset`` and ``limit`` parameters.
    :param offset: value of ``offset`` for the first page.
    :param limit: page size; ``offset`` advances by this amount per page.
    :return: RDD holding one list of values per fetched record.
    """
    start_time = time.asctime()
    print("Starting @  {}".format(start_time))

    # Collect one RDD per page and union them once at the end: chaining
    # ``rdd.union`` inside the loop nests the lineage one level per page.
    page_rdds = [sc.parallelize([], 4)]
    loaded = 0
    while True:
        params = {"offset": offset, "limit": limit}
        cur = neo4j_con.run(neo4j_main_query, params)
        nodes = []
        for record in cur:
            # NOTE(review): the exact-type check only re-builds plain dicts;
            # dict subclasses are stored unchanged. Confirm whether
            # ``isinstance`` was intended before widening it.
            row = [dict(value) if type(value) is dict else value
                   for value in record.values()]
            nodes.append(deepcopy(row))
        offset += limit
        if not nodes:
            # An empty page means the query is exhausted.
            break
        # Report what was actually fetched so far; the previous
        # ``offset + limit`` was one full page ahead and ignored short pages.
        loaded += len(nodes)
        print("Loaded {} nodes to RDD @ {}".format(loaded, time.asctime()))
        page_rdds.append(sc.parallelize(nodes))
    neo4j_rdd = sc.union(page_rdds)

    print("Finished at  {}".format(time.asctime()))
    print("Loaded records in RDD. Started @ {} Finished @ {}"
          .format(start_time, time.asctime()))
    return neo4j_rdd