Code example #1
File: app.py  Project: R4chel/RecommendationGraph
from py2neo import cypher

def create_graph(GRAPHDB):
    # Do we have a node whose 'name' property is set to 'Neo'?
    # If so, we've probably been here before.
    data, metadata = cypher.execute(GRAPHDB, "START n=node(*) WHERE n.name='Neo' RETURN n")
    if not data:
        # Create two nodes, one for us and one for you.
        # Make sure they both have 'name' properties with values.
        from_node, to_node = GRAPHDB.create({"name": "Neo"}, {"name": "you"})

        # Create a 'loves' relationship from the 'from' node to the 'to' node.
        GRAPHDB.create((from_node, "loves", to_node))
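These snippets assume a py2neo 1.x connection held in GRAPHDB. A minimal setup sketch, assuming a local Neo4j server on the default legacy REST endpoint:

from py2neo import neo4j

GRAPHDB = neo4j.GraphDatabaseService("http://localhost:7474/db/data/")
create_graph(GRAPHDB)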
Code example #2
from py2neo import neo4j

def saveToNeo4j(found_pages, found_links):
    # Note: found_pages is unused here; the links alone drive node creation.
    print("++: saving to neo4j")
    title_index = GRAPHDB.get_or_create_index(neo4j.Node, "TitleIndex")
    i = 0
    for link in found_links:
        pageA, pageB = link
        # get_or_create keeps node creation idempotent per title.
        nodeA = title_index.get_or_create("title", pageA, {"title": pageA})
        nodeB = title_index.get_or_create("title", pageB, {"title": pageB})
        GRAPHDB.create((nodeA, "links_to", nodeB))
        if not i % 100:
            print(i)  # progress marker every 100 links
        i += 1
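A minimal call sketch, assuming GRAPHDB is connected as above; each link is a (source_title, target_title) pair, and found_pages is ignored by this function:

found_links = [("Red", "Color"), ("Color", "Light")]
saveToNeo4j([], found_links)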
Code example #3
def add_color_to_db(title, labels=None, hex=None):
    # Without a hex value, fall back to creating a plain page node.
    if not hex:
        return add_page_to_db(title, labels)
    node = GRAPHDB.get_or_create_indexed_node("TitleIndex", "title", title, {"title": title, "hex": hex})
    for label in labels or []:
        node.add_labels(label)
    node.add_labels("Color")
    return node
Code example #4
from py2neo import neo4j

def saveToNeo4jBatch(found_pages, found_links):
    url_index = GRAPHDB.get_or_create_index(neo4j.Node, "UrlIndex")  # created but not used below
    pageToNode = {}
    for page in found_pages:
        name = getNameFromLink(page)
        node = GRAPHDB.create({"name": name, "url": page})[0]
        # TODO: add labels based on infobox
        pageToNode[page] = node
    # Save links, flushing the batch every 100 operations to keep requests small.
    i = 0
    batch = neo4j.WriteBatch(GRAPHDB)
    for link in found_links:
        pageA, pageB = link
        nodeA = pageToNode.get(pageA)
        nodeB = pageToNode.get(pageB)
        if nodeA is None or nodeB is None:
            continue  # skip links whose endpoints were never crawled
        batch.get_or_create_path(nodeA, "links_to", nodeB)
        if not i % 100:
            print("i: " + str(i))
            batch.run()
            batch = neo4j.WriteBatch(GRAPHDB)
        i += 1
    batch.run()  # flush the final partial batch
    print("total num links created: " + str(i))
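getNameFromLink is not shown in this listing; a plausible stand-in, assuming each page is a wiki URL whose last path segment is the title (the helper's behavior here is an assumption, not the project's actual code):

def getNameFromLink(page):
    # Hypothetical helper: "https://en.wikipedia.org/wiki/Neo_(The_Matrix)" -> "Neo (The Matrix)"
    return page.rstrip("/").rsplit("/", 1)[-1].replace("_", " ")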
Code example #5
import random

def handle_row_inner(inner_row):
    # In the original source this is nested in a loop that binds nodeA.
    nodeB = inner_row[0]
    weight = random.random()
    print("w: " + str(weight))
    if nodeA != nodeB:
        GRAPHDB.create((nodeA, "RELEVANCY", nodeB, {"weight": weight}))
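This fragment only runs correctly with nodeA bound in an enclosing scope. A minimal reconstruction of that context, assuming the rows come from a cypher query over all nodes (the query and the pairing loop are illustrative, not taken from the project):

from py2neo import cypher

data, metadata = cypher.execute(GRAPHDB, "START n=node(*) RETURN n")
for row in data:
    nodeA = row[0]  # bound here; handle_row_inner picks it up
    for inner_row in data:
        handle_row_inner(inner_row)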
Code example #6
def add_page_to_db(title, labels=None):
    node = GRAPHDB.get_or_create_indexed_node("TitleIndex", "title", title, {"title": title})
    for label in labels or []:
        node.add_labels(label)
    # node.add_labels("Page")  # disabled in the original source
    return node
Code example #7
import re
import time

import pywikibot
from py2neo import neo4j

def crawl_pages(pages_to_crawl, depth_remaining, color=False):
    # GRAPHDB, SITE and the helper functions are module-level in app.py.
    pages_to_crawl_next = []
    pages_to_link = []
    depth_remaining -= 1

    # Titles such as "File:..." are namespace links; "de:..." etc. are language links.
    skip_prefixes = ["File:", "Category:", "Wikipedia:", "Template:"]
    language_regex = re.compile("^[a-zA-Z][a-zA-Z]:.*$")

    edge_batch = neo4j.WriteBatch(GRAPHDB)
    while pages_to_crawl:
        i = 0
        for page in pages_to_crawl:
            if page in pages_to_link:
                continue
            pages_to_link.append(page)

            title = clean_title(page.title())
            infoboxes = get_infoboxes(page)

            if color:
                node = add_color_to_db(title, infoboxes, get_hex(page))
            else:
                node = add_page_to_db(title, infoboxes)

            categories = get_categories(page)
            for category in categories:
                adj_node = add_category_to_db(category)
                edge_batch.get_or_create_path(node, "has_category", adj_node)
            if depth_remaining >= 0:
                linked_pages = get_links(page)
                for messy_link in linked_pages:
                    link_title = clean_title(messy_link.title.strip_code())
                    link = pywikibot.Page(SITE, link_title)
                    if any(link_title.startswith(p) for p in skip_prefixes):
                        continue
                    if language_regex.match(link_title):
                        print("DEBUG: Rejecting language-based title: " + link_title)
                        continue
                    pages_to_crawl_next.append(link)
            time.sleep(25)  # throttle requests to the wiki
            print(str(i))
            i += 1

        pages_to_crawl = pages_to_crawl_next
        pages_to_crawl_next = []
        depth_remaining -= 1

    print("******* " + str(len(pages_to_link)))
    j = 0
    for page in pages_to_link:
        page_title = clean_title(page.title())
        page_node = GRAPHDB.get_indexed_node("TitleIndex", "title", page_title)
        links = get_links(page)

        for messy_link in links:
            link_title = clean_title(messy_link.title.strip_code())
            if any(link_title.startswith(p) for p in skip_prefixes):
                continue

            adj_node = GRAPHDB.get_indexed_node("TitleIndex", "title", link_title)
            if adj_node:
                edge_batch.get_or_create_path(page_node, "links_to", adj_node)
        print(j)
        j += 1
    edge_batch.submit()
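A sketch of starting a crawl, assuming pywikibot is configured for English Wikipedia; the seed title is illustrative:

SITE = pywikibot.Site("en", "wikipedia")
seed = pywikibot.Page(SITE, "Red")
crawl_pages([seed], depth_remaining=1, color=True)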
Code example #8
def add_category_to_db(category):
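    # Idempotent by title: repeating a category reuses the existing node.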
    node = GRAPHDB.get_or_create_indexed_node("TitleIndex", "title", category, {"title": category})
    node.add_labels("Category")
    return node