def export_graph_data():
    # Export the crawled link graph as two Gephi-style TSV files:
    # one for edges (Source -> Target) and one for nodes (Id, Label).
    data_file = open(EXPORT_EDGES_FILE, "w")
    db = db_utils.getDBInstance()

    nodes = set()
    data_file.write("Source\tTarget\n")
 
    pages = db.webpages.find()
    for page in pages:
        ilinks = page["incoming_links"]
        for link in ilinks:
            # skip javascript: pseudo-links on either end of the edge
            if link.startswith("javascript"):
                continue
            if page["url"].startswith("javascript"):
                continue
            # strip commas so each value stays a single tab-separated field
            link = link.replace(",", "")
            nodes.add(link)
            page["url"] = page["url"].replace(",", "")
            nodes.add(page["url"])
            data_file.write("%s\t%s\n" % (link, page["url"]))

    nodes_file = open(EXPORT_NODES_FILE, "w")

    nodes_file.write("Id\tLabel\n")
    for node in nodes:
        nodes_file.write("%s\t%s\n"%(node, utils.domainOf(node)))
    
    nodes_file.close()
    data_file.close()
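

# A minimal sketch, not part of the original module, showing how the two TSV
# files written by export_graph_data() could be loaded back into a networkx
# graph for inspection. It assumes the same module-level EXPORT_NODES_FILE /
# EXPORT_EDGES_FILE paths and the "Id\tLabel" / "Source\tTarget" headers
# written above; the function name is illustrative.
def load_exported_graph():
    import csv
    g = nx.DiGraph()
    with open(EXPORT_NODES_FILE) as nodes_file:
        for row in csv.DictReader(nodes_file, delimiter="\t"):
            g.add_node(row["Id"], label=row["Label"])
    with open(EXPORT_EDGES_FILE) as edges_file:
        for row in csv.DictReader(edges_file, delimiter="\t"):
            g.add_edge(row["Source"], row["Target"])
    return g
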
def crawl_manager(number_of_processes=12):
    # Seed the queue, then repeatedly hand batches of unvisited URLs to a
    # worker pool until the iteration budget is exhausted.
    db = db_utils.getDBInstance()
    seedStartURLsInDB(db, START_URLS)
    number_of_iterations = 10
    while number_of_iterations:
        pool = Pool(number_of_processes)
        urls = getUnvisitedURLsFromDB(db, number_of_processes)
        pool.map(crawler.crawl_url, urls)
        # release the worker processes before starting the next batch
        pool.close()
        pool.join()
        number_of_iterations -= 1
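

# A minimal usage sketch, not taken from the original source: launch the
# crawl manager as a script entry point. The worker count here is only
# illustrative.
if __name__ == "__main__":
    crawl_manager(number_of_processes=4)
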
def draw_graph3d(graph_colormap='winter', bgcolor=(1, 1, 1),
                 node_size=0.03,
                 edge_color=(0.8, 0.8, 0.8), edge_size=0.002,
                 text_size=0.008, text_color=(0, 0, 0)):
    # Build an undirected link graph from the webpages collection and render
    # it as an interactive 3D points/tubes plot with mayavi.
    H = nx.Graph()
    
    db = db_utils.getDBInstance()
    nodes = db.webpages.find()
    edges = []
    for node in nodes:
        iurls = node["incoming_links"]
        for iurl in iurls:
            H.add_edge(iurl, node["url"])

    G = nx.convert_node_labels_to_integers(H)

    graph_pos = nx.spring_layout(G, dim=3)

    # numpy array of x,y,z positions in sorted node order
    xyz = np.array([graph_pos[v] for v in sorted(G)])

    # scalar colors
    scalars = np.array(G.nodes()) + 5
    mlab.figure(1, bgcolor=bgcolor)
    mlab.clf()

    pts = mlab.points3d(xyz[:,0], xyz[:,1], xyz[:,2],
                        scalars,
                        scale_factor=node_size,
                        scale_mode='none',
                        colormap=graph_colormap,
                        resolution=20)

    for i, (x, y, z) in enumerate(xyz):
        label = mlab.text(x, y, str(i), z=z,
                          width=text_size, name=str(i), color=text_color)
        label.property.shadow = True

    pts.mlab_source.dataset.lines = np.array(G.edges())
    tube = mlab.pipeline.tube(pts, tube_radius=edge_size)
    mlab.pipeline.surface(tube, color=edge_color)

    mlab.show() # interactive window
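

# A hedged 2D fallback sketch (not part of the original code): if mayavi is
# unavailable, the same link graph can be laid out and drawn with matplotlib
# instead. The function name is illustrative.
def draw_graph2d(node_size=20):
    import matplotlib.pyplot as plt
    H = nx.Graph()
    db = db_utils.getDBInstance()
    for page in db.webpages.find():
        for iurl in page["incoming_links"]:
            H.add_edge(iurl, page["url"])
    pos = nx.spring_layout(H)
    nx.draw(H, pos, node_size=node_size, with_labels=False)
    plt.show()
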
def crawl_url(url, headless=True, save_into_db=True):
    # Crawl a single URL in a (possibly headless) Chrome instance.
    print "Crawling URL", url
    iurl_hash = utils.get_url_hash(url)
    update = {iurl_hash: url}

    db = db_utils.getDBInstance()
    # take the URL off the crawl queue; skip it entirely if its domain
    # matches the regex filter
    db.crawl_queue.remove(update)
    if regex_domain_match(db, url):
        print "Skipping: ", url
        return

    url = utils.sanitize_url(url)
    url_hash = utils.get_url_hash(url)
    db_query = {'url_hash': url_hash}

    if headless:
        display = Display(visible=0, size=(800, 600))
        display.start()

    
    obj_in_db = db.webpages.find_one(db_query)
    
    webpage = None
    if not obj_in_db:
        webpage = WebPage(url)


    # `capabilities` is assumed to be a module-level desired_capabilities
    # dict for Chrome, defined elsewhere in the original module
    browser = webdriver.Chrome(desired_capabilities=capabilities)
    browser.set_page_load_timeout(30)
    #browser.implicitly_wait(5)
    try:
        print "Visiting page: ",url
        if not url.startswith("http"):
            raise Exception("URL does not start with http: %s" % url)

        browser.get(url)
        #time.sleep(1)
    except Exception, e:
        print "Error Occured"
        browser.quit()
        print e
        return -1