# Third-party and stdlib imports used by the functions below; the project's
# own modules (db_utils, utils, crawler, WebPage) and constants such as
# EXPORT_EDGES_FILE, EXPORT_NODES_FILE, START_URLS and capabilities are
# defined elsewhere in the repository.
from multiprocessing import Pool

import networkx as nx
import numpy as np
from mayavi import mlab
from pyvirtualdisplay import Display
from selenium import webdriver


def export_graph_data():
    """Dump the crawl graph as two Gephi-style TSV files: edges and nodes."""
    data_file = open(EXPORT_EDGES_FILE, "w")
    db = db_utils.getDBInstance()
    nodes = set()
    data_file.write("Source\tTarget\n")
    pages = db.webpages.find()
    for page in pages:
        # Skip javascript: pseudo-URLs; they are not real pages.
        if page["url"].startswith("javascript"):
            continue
        for link in page["incoming_links"]:
            if link.startswith("javascript"):
                continue
            # Commas would confuse downstream CSV/TSV tooling, so strip them.
            link = link.replace(",", "")
            nodes.add(link)
            page["url"] = page["url"].replace(",", "")
            nodes.add(page["url"])
            data_file.write("%s\t%s\n" % (link, page["url"]))
    nodes_file = open(EXPORT_NODES_FILE, "w")
    nodes_file.write("Id\tLabel\n")
    for node in nodes:
        nodes_file.write("%s\t%s\n" % (node, utils.domainOf(node)))
    nodes_file.close()
    data_file.close()

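# export_graph_data() relies on utils.domainOf to turn a URL into the node
# label. The real helper lives in utils and is not shown in this file; a
# minimal sketch of the assumed behavior (hypothetical name, not the
# project's API) would be:

def domain_of_sketch(url):
    """Return the hostname of a URL, e.g. 'http://a.com/x' -> 'a.com'."""
    from urlparse import urlparse  # urllib.parse on Python 3
    return urlparse(url).netloc

# The exported files are plain TSV with "Source\tTarget" and "Id\tLabel"
# headers, which Gephi's spreadsheet importer accepts directly.
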
def crawl_manager(number_of_processes=12):
    """Seed the start URLs, then crawl in fixed-size batches with a pool."""
    db = db_utils.getDBInstance()
    seedStartURLsInDB(db, START_URLS)
    number_of_iterations = 10
    while number_of_iterations:
        pool = Pool(number_of_processes)
        # Pull one batch of unvisited URLs and crawl them in parallel.
        urls = getUnvisitedURLsFromDB(db, number_of_processes)
        result = pool.map(crawler.crawl_url, urls)
        # Release the worker processes before starting the next batch;
        # without this, each iteration leaks a pool of live processes.
        pool.close()
        pool.join()
        number_of_iterations -= 1

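# seedStartURLsInDB and getUnvisitedURLsFromDB are queue helpers defined
# elsewhere. Assuming queue documents have the {url_hash: url} shape implied
# by the remove() call in crawl_url below (an assumption, not confirmed by
# this file), they could look roughly like these sketches:

def seed_start_urls_sketch(db, start_urls):
    """Queue each seed URL, keyed by its hash as crawl_url expects."""
    for url in start_urls:
        db.crawl_queue.insert({utils.get_url_hash(url): url})

def get_unvisited_urls_sketch(db, limit):
    """Pull up to `limit` queued URLs; each doc maps a url hash to its URL."""
    urls = []
    for doc in db.crawl_queue.find().limit(limit):
        doc.pop('_id', None)  # keep only the hash -> url mapping
        urls.extend(doc.values())
    return urls
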
def draw_graph3d(graph_colormap='winter', bgcolor=(1, 1, 1),
                 node_size=0.03, edge_color=(0.8, 0.8, 0.8), edge_size=0.002,
                 text_size=0.008, text_color=(0, 0, 0)):
    """Render the incoming-link graph as an interactive 3D plot via Mayavi."""
    H = nx.Graph()
    db = db_utils.getDBInstance()
    nodes = db.webpages.find()
    for node in nodes:
        # Each stored page contributes one edge per incoming link.
        for iurl in node["incoming_links"]:
            H.add_edge(iurl, node["url"])
    # Integer labels are needed to index positions and colors below.
    G = nx.convert_node_labels_to_integers(H)
    graph_pos = nx.spring_layout(G, dim=3)
    # numpy array of x, y, z positions in sorted node order
    xyz = np.array([graph_pos[v] for v in sorted(G)])
    # scalar colors
    scalars = np.array(G.nodes()) + 5
    mlab.figure(1, bgcolor=bgcolor)
    mlab.clf()
    pts = mlab.points3d(xyz[:, 0], xyz[:, 1], xyz[:, 2], scalars,
                        scale_factor=node_size, scale_mode='none',
                        colormap=graph_colormap, resolution=20)
    for i, (x, y, z) in enumerate(xyz):
        label = mlab.text(x, y, str(i), z=z, width=text_size,
                          name=str(i), color=text_color)
        label.property.shadow = True
    # Hand the edge list to the glyph source, then draw the edges as tubes.
    pts.mlab_source.dataset.lines = np.array(G.edges())
    tube = mlab.pipeline.tube(pts, tube_radius=edge_size)
    mlab.pipeline.surface(tube, color=edge_color)
    mlab.show()  # interactive window

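# Mayavi can be heavy to install. The same incoming-link graph can be
# inspected in 2D with networkx's matplotlib drawing; this is a simplified
# alternative sketch, not part of the original pipeline:

def draw_graph2d_sketch():
    """Quick 2D spring-layout view of the crawl graph (matplotlib fallback)."""
    import matplotlib.pyplot as plt
    H = nx.Graph()
    db = db_utils.getDBInstance()
    for node in db.webpages.find():
        for iurl in node["incoming_links"]:
            H.add_edge(iurl, node["url"])
    pos = nx.spring_layout(H)
    nx.draw(H, pos, node_size=20, with_labels=False)
    plt.show()
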
def crawl_url(url, headless=True, save_into_db=True):
    print "Crawling URL", url
    iurl_hash = utils.get_url_hash(url)
    # Queue documents are keyed by the URL's hash, i.e. {url_hash: url}.
    update = {iurl_hash: url}
    db = db_utils.getDBInstance()
    if regex_domain_match(db, url):
        # Domain matches the skip list: dequeue it and do not crawl.
        print "Skipping:", url
        db.crawl_queue.remove(update)
        return 0
    url = utils.sanitize_url(url)
    url_hash = utils.get_url_hash(url)
    db_query = {'url_hash': url_hash}
    if headless:
        # Run the browser inside a virtual framebuffer (no visible window).
        display = Display(visible=0, size=(800, 600))
        display.start()
    obj_in_db = db.webpages.find_one(db_query)
    webpage = None
    if not obj_in_db:
        webpage = WebPage(url)
    browser = webdriver.Chrome(desired_capabilities=capabilities)
    browser.set_page_load_timeout(30)
    # browser.implicitly_wait(5)
    try:
        print "Visiting page:", url
        if not url.startswith("http"):
            raise Exception("unsupported URL scheme: %s" % url)
        browser.get(url)
        # time.sleep(1)
    except Exception as e:
        print "Error occurred"
        browser.quit()
        print e
        return -1
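
# crawl_url dedupes pages via utils.get_url_hash and utils.sanitize_url,
# whose real definitions live in utils. One plausible sketch (assumptions:
# md5 for hashing, fragment and trailing-slash stripping for sanitizing;
# hypothetical names, not the project's API):

import hashlib

def get_url_hash_sketch(url):
    """Stable hash of a URL, suitable as a MongoDB lookup key."""
    return hashlib.md5(url.encode('utf-8')).hexdigest()

def sanitize_url_sketch(url):
    """Normalize a URL so trivially different forms hash identically."""
    url = url.split('#')[0]  # drop the fragment
    return url.rstrip('/')   # ignore a trailing slash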