def create_graph(GRAPHDB):
    """Seed the graph with a 'Neo' -> 'you' pair unless 'Neo' already exists.

    A Cypher query looks for any node whose ``name`` property equals 'Neo'.
    When the query returns no rows, two named nodes are created and joined
    by a 'loves' relationship; otherwise nothing is written.
    """
    # A hit here means we have probably been here before.
    rows, _metadata = cypher.execute(
        GRAPHDB, "START n=node(*) where n.name='Neo' return n"
    )
    if rows:
        return

    # One node for us and one for you, each carrying a 'name' property.
    neo, you = GRAPHDB.create({"name": "Neo"}, {"name": "you"})

    # The relationship points from the 'Neo' node to the 'you' node.
    GRAPHDB.create((neo, "loves", you))
def saveToNeo4j(found_pages, found_links):
    """Store each (source, target) title pair as a ``links_to`` relationship.

    Both endpoints are fetched or created through the "TitleIndex" node
    index under the "title" key. ``found_pages`` is accepted but not used
    here. The zero-based link counter is printed on every 100th link.
    """
    print("++: saving to neo4j")
    titles = GRAPHDB.get_or_create_index(neo4j.Node, "TitleIndex")

    for count, (source_title, target_title) in enumerate(found_links):
        source = titles.get_or_create(
            "title", source_title, {"title": source_title}
        )
        target = titles.get_or_create(
            "title", target_title, {"title": target_title}
        )
        GRAPHDB.create((source, "links_to", target))

        # Progress output: fires for 0, 100, 200, ...
        if count % 100 == 0:
            print(count)
def add_color_to_db(title, labels=None, hex=None):
    """Fetch or create the node for a colour page and label it "Color".

    Args:
        title: Value stored under, and indexed by, the "title" key in
            "TitleIndex".
        labels: Optional iterable of extra labels to add to the node.
            Defaults to no extra labels.
        hex: Colour value stored in the node's "hex" property. The name
            shadows the ``hex`` builtin but is kept so keyword callers keep
            working.

    Returns:
        The node. When ``hex`` is falsy the call is delegated to
        ``add_page_to_db`` and its result is returned unchanged, so no
        "Color" label is added in that case.
    """
    # A fresh list per call replaces the shared mutable ``[]`` default.
    # Normalising here also guarantees add_page_to_db gets an iterable.
    if labels is None:
        labels = []

    if not hex:
        return add_page_to_db(title, labels)

    node = GRAPHDB.get_or_create_indexed_node(
        "TitleIndex", "title", title, {"title": title, "hex": hex}
    )
    for label in labels:
        node.add_labels(label)
    node.add_labels("Color")
    return node
def saveToNeo4jBatch(found_pages, found_links):
    """Create one node per page, then add ``links_to`` paths in write batches.

    Every entry of ``found_pages`` becomes a node with "name" and "url"
    properties. Links are queued on a ``neo4j.WriteBatch``; the batch is run
    and replaced whenever the zero-based link counter is a multiple of 100,
    and whatever remains is run after the loop. Endpoints are looked up with
    ``dict.get``, so a link to a page missing from ``found_pages`` passes
    ``None`` to ``get_or_create_path``.
    """
    # The returned handle is not used; the call is kept for whatever effect
    # get_or_create_index has on the database.
    GRAPHDB.get_or_create_index(neo4j.Node, "UrlIndex")

    node_by_page = {}
    for page_url in found_pages:
        properties = {"name": getNameFromLink(page_url), "url": page_url}
        # TODO: add labels based on infobox
        node_by_page[page_url] = GRAPHDB.create(properties)[0]

    # save links
    total_links = 0
    pending = neo4j.WriteBatch(GRAPHDB)
    for index, (source_url, target_url) in enumerate(found_links):
        source_node = node_by_page.get(source_url)
        target_node = node_by_page.get(target_url)
        pending.get_or_create_path(source_node, "links_to", target_node)

        # Flush on 0, 100, 200, ... and start a fresh batch.
        if index % 100 == 0:
            print("i: " + str(index))
            pending.run()
            pending = neo4j.WriteBatch(GRAPHDB)
        total_links = index + 1

    pending.run()
    print("total num links created: " + str(total_links))
def handle_row_inner(inner_row):
    """Relate ``nodeA`` to the first element of ``inner_row`` with a random weight.

    ``nodeA`` is not a parameter; it is read from the surrounding scope. A
    "RELEVANCY" relationship carrying a random "weight" property is created
    unless the row's node compares equal to ``nodeA``.
    """
    candidate = inner_row[0]

    # The weight is drawn and printed even when no relationship is created.
    relevancy = random.random()
    print("w: " + str(relevancy))

    if nodeA != candidate:
        edge = (nodeA, "RELEVANCY", candidate, {"weight": relevancy})
        GRAPHDB.create(edge)
def add_page_to_db(title, labels=None):
    """Fetch or create the "TitleIndex" node for ``title`` and label it.

    Args:
        title: Value stored under, and indexed by, the "title" key.
        labels: Optional iterable of labels to add to the node. Defaults to
            no labels.

    Returns:
        The fetched or newly created node.
    """
    # ``None`` replaces the shared mutable ``[]`` default.
    if labels is None:
        labels = []

    node = GRAPHDB.get_or_create_indexed_node(
        "TitleIndex", "title", title, {"title": title}
    )
    for label in labels:
        node.add_labels(label)
    # The generic "Page" label is currently disabled:
    # node.add_labels("Page")
    return node
def crawl_pages(pages_to_crawl, depth_remaining, color=False):
    """Crawl wiki pages level by level into the graph, then link them.

    Phase 1 walks ``pages_to_crawl`` one level at a time. Each page not seen
    before is stored via ``add_color_to_db`` (when ``color`` is true) or
    ``add_page_to_db`` and gets a "has_category" edge to each of its
    categories. While the depth budget allows, its outgoing links are queued
    for the next level. Phase 2 revisits every stored page and queues a
    "links_to" edge to each linked title that already has a node in
    "TitleIndex". All edges go onto a single write batch that is submitted
    once, at the very end.

    Args:
        pages_to_crawl: Pages forming the first level; ``title()`` is called
            on each.
        depth_remaining: Number of link hops to follow from the starting
            pages; 0 stores only the starting pages.
        color: When true, pages are stored as colours (with their hex value)
            instead of plain pages.
    """
    # Namespaces that are never crawled or linked.
    skipped_prefixes = ("File:", "Category:", "Wikipedia:", "Template:")
    # Two letters followed by a colon, e.g. "de:..." -- compiled once rather
    # than once per link.
    language_regex = re.compile("^[a-zA-Z][a-zA-Z]:.*$")

    pages_to_crawl_next = []
    pages_to_link = []
    depth_remaining -= 1
    edge_batch = neo4j.WriteBatch(GRAPHDB)

    # Phase 1: store pages and categories, one level per iteration.
    while pages_to_crawl:
        i = 0  # counts pages actually processed in this level
        for page in pages_to_crawl:
            if page in pages_to_link:
                continue
            pages_to_link.append(page)

            title = clean_title(page.title())
            infoboxes = get_infoboxes(page)
            if color:
                node = add_color_to_db(title, infoboxes, get_hex(page))
            else:
                node = add_page_to_db(title, infoboxes)

            for category in get_categories(page):
                adj_node = add_category_to_db(category)
                edge_batch.get_or_create_path(node, "has_category", adj_node)

            if depth_remaining >= 0:
                for messy_link in get_links(page):
                    link_title = clean_title(messy_link.title.strip_code())
                    # startswith() accepts a tuple; unlike filter(), the
                    # result is a real boolean on Python 3 as well.
                    if link_title.startswith(skipped_prefixes):
                        continue
                    if language_regex.match(link_title):
                        print("DEBUG: Rejecting language based title: " + link_title)
                        continue
                    # Build the Page only for links that survive filtering.
                    pages_to_crawl_next.append(pywikibot.Page(SITE, link_title))

            # Pause between pages; presumably throttles requests to the
            # wiki -- TODO confirm.
            time.sleep(25)
            print(str(i))
            i += 1

        pages_to_crawl = pages_to_crawl_next
        pages_to_crawl_next = []
        depth_remaining -= 1

    print("******* " + str(len(pages_to_link)))

    # Phase 2: connect stored pages to linked pages that are in the index.
    for j, page in enumerate(pages_to_link):
        page_title = clean_title(page.title())
        page_node = GRAPHDB.get_indexed_node("TitleIndex", "title", page_title)
        for messy_link in get_links(page):
            link_title = clean_title(messy_link.title.strip_code())
            if link_title.startswith(skipped_prefixes):
                continue
            adj_node = GRAPHDB.get_indexed_node("TitleIndex", "title", link_title)
            if adj_node:
                edge_batch.get_or_create_path(page_node, "links_to", adj_node)
        print(j)

    edge_batch.submit()
def add_category_to_db(category):
    """Return the "TitleIndex" node for ``category``, labelled "Category".

    The node is fetched or created under the "title" key with a matching
    "title" property, and the "Category" label is added before returning.
    """
    properties = {"title": category}
    category_node = GRAPHDB.get_or_create_indexed_node(
        "TitleIndex", "title", category, properties
    )
    category_node.add_labels("Category")
    return category_node