Example #1
import networkx as nx  # the snippets below also rely on the project-internal lru and ms modules and the write_graph_in_format helper

def write_nodelinks_network_from_MS(nodes_links, filename, fileformat='gexf'):
    # Build a directed node-to-node graph from link objects and export it.
    G = nx.DiGraph()
    for link in nodes_links:
        G.add_node(link.targetLRU, label=lru.lru_to_url(link.targetLRU))
        G.add_node(link.sourceLRU, label=lru.lru_to_url(link.sourceLRU))
        G.add_edge(link.sourceLRU, link.targetLRU, weight=link.weight)
    write_graph_in_format(G, filename, fileformat)
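A minimal, hedged usage sketch. The stand-ins below for the project-internal lru.lru_to_url and write_graph_in_format helpers are hypothetical, just enough to run the function outside the project:

from types import SimpleNamespace

lru = SimpleNamespace(lru_to_url=lambda l: l)  # stand-in: the real conversion lives in the lru module
def write_graph_in_format(G, filename, fileformat):
    getattr(nx, "write_" + fileformat)(G, filename)  # e.g. nx.write_gexf

demo_links = [SimpleNamespace(sourceLRU="s:http|h:com|h:example|",
                              targetLRU="s:http|h:com|h:example|p:about|",
                              weight=2)]
write_nodelinks_network_from_MS(demo_links, "network.gexf")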
Example #3
def write_pages_network_from_mongo(pages, filename, fileformat='gexf'):
    # Build a directed page-to-page graph from crawled page documents.
    G = nx.DiGraph()
    for page in pages:
        if "lrulinks" in page:
            G.add_node(page['lru'], label=page['url'])
            for lrulink in page["lrulinks"]:
                G.add_node(lrulink, label=lru.lru_to_url(lrulink))
                G.add_edge(page['lru'], lrulink)
    write_graph_in_format(G, filename, fileformat)
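With the same hypothetical stand-ins as above, a sketch of the input this function expects: an iterable of dicts (e.g. MongoDB documents) carrying 'lru', 'url' and, for pages with outlinks, 'lrulinks'. The sample documents are invented:

demo_pages = [
    {"lru": "s:http|h:com|h:example|",
     "url": "http://example.com/",
     "lrulinks": ["s:http|h:com|h:example|p:about|"]},
    {"lru": "s:http|h:com|h:example|p:about|",
     "url": "http://example.com/about"},  # no "lrulinks": skipped by the guard
]
write_pages_network_from_mongo(demo_pages, "pages.gexf")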
Example #5
def generate_cache_from_pages_list(pageList,
                                   precision_limit=1,
                                   precision_exceptions=[],
                                   verbose=False):
    # Index crawled pages and the pages they link to, and count the links
    # between the LRU nodes those pages resolve to.
    if verbose:
        print("### createCache")
    pages = {}
    links = {}
    original_link_number = 0
    nodes = {}
    for page_item in pageList:
        page_item["lru"] = lru.cleanLRU(page_item["lru"])
        is_full_precision = lru.isFullPrecision(page_item["lru"],
                                                precision_exceptions)
        lru_head = lru.getLRUHead(page_item["lru"], precision_exceptions)
        is_node = lru.isLRUNode(page_item["lru"],
                                precision_limit,
                                lru_head=lru_head)
        node_lru = page_item["lru"] if is_node else lru.getLRUNode(
            page_item["lru"], precision_limit, lru_head=lru_head)
        nodes[node_lru] = 1
        # Create index of crawled pages from queue
        if page_item["lru"] not in pages:
            pages[page_item["lru"]] = ms.PageItem(
                str(page_item["_id"]), page_item["url"].encode('utf8'),
                page_item["lru"].encode('utf8'), str(page_item["timestamp"]),
                int(page_item["status"]), int(page_item["depth"]),
                str(page_item["error"]), ['CRAWL'], is_full_precision,
                is_node, {})
        else:
            if 'CRAWL' not in pages[page_item["lru"]].sourceSet:
                pages[page_item["lru"]].sourceSet.append('CRAWL')
            # Keep the shallowest non-negative crawl depth seen for the page
            pages[page_item["lru"]].depth = max(
                0, min(pages[page_item["lru"]].depth, int(page_item["depth"])))
        # Add linked pages to the index and count all links between nodes
        if "lrulinks" in page_item:
            for index, lrulink in enumerate(page_item["lrulinks"]):
                lrulink = lru.cleanLRU(lrulink)
                is_full_precision = lru.isFullPrecision(lrulink,
                                                        precision_exceptions)
                lru_head = lru.getLRUHead(lrulink, precision_exceptions)
                is_node = lru.isLRUNode(lrulink,
                                        precision_limit,
                                        lru_head=lru_head)
                target_node = lrulink if is_node else lru.getLRUNode(
                    lrulink, precision_limit, lru_head=lru_head)
                nodes[target_node] = 1
                original_link_number += 1
                # check False {} errorcode
                if lrulink not in pages:
                    pages[lrulink] = ms.PageItem(
                        str(page_item["_id"]) + "_" + str(index),
                        lru.lru_to_url(lrulink).encode('utf8'),
                        lrulink.encode('utf8'), str(page_item["timestamp"]),
                        None, int(page_item["depth"]) + 1, None, ['LINK'],
                        is_full_precision, is_node, {})
                elif 'LINK' not in pages[lrulink].sourceSet:
                    pages[lrulink].sourceSet.append('LINK')
                links[(node_lru, target_node)] = links.get(
                    (node_lru, target_node), 0) + 1
    if verbose:
        print("%d unique pages ; %d links ; %d unique links / identified %d nodes"
              % (len(pages), original_link_number, len(links), len(nodes)))
    return (pages, [(source, target, weight)
                    for (source, target), weight in links.items()])
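A hedged call sketch. The ms.PageItem constructor and the lru helpers are project-internal, so this only runs inside the project; the sample crawl document below is hypothetical but carries every field the function reads:

crawled = [{
    "_id": "5d41402abc4b2a76b9719d91",
    "url": "http://example.com/",
    "lru": "s:http|h:com|h:example|",
    "timestamp": 1400000000,
    "status": 200,
    "depth": 0,
    "error": "None",
    "lrulinks": ["s:http|h:com|h:example|p:about|"],
}]
pages, weighted_links = generate_cache_from_pages_list(crawled, verbose=True)
# pages maps each LRU to an ms.PageItem; weighted_links is a list of
# (source_node, target_node, weight) tuples ready for the writers above.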