import Queue   # Python 2 stdlib module (named queue in Python 3)
import signal

# Fetcher, get_images, and all_time_visited are defined elsewhere in this project.

def run_crawler(current_pages, min_images):
    global all_time_visited
    q = Queue.Queue()
    visited = set()
    for page in current_pages:
        q.put(page)
        visited.add(page)
    images = []
    while len(images) < min_images:
        page = q.get()
        visited.add(page)
        all_time_visited.add(page)
        links = Fetcher.fetch_links(page)
        if links is None:
            continue
        for l in links[:10]:
            print(l)
            signal.alarm(10)
            try:
                if l not in visited and l not in all_time_visited:
                    q.put(l)
                    images.extend(get_images(l))
                    print("\n\nCurrent number of images")
                    print(len(images))
            except TimeoutException:
                continue  # skip this link if get_images takes more than 10 seconds
            else:
                # No timeout occurred: cancel the pending alarm
                signal.alarm(0)
    print('return value:', q.queue, '\n\n', images)
    return list(q.queue), images
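run_crawler arms signal.alarm(10) and catches a TimeoutException that the excerpt never defines. Below is a minimal sketch of the SIGALRM machinery it appears to rely on; the exception class and handler names are placeholders rather than the project's actual ones, and SIGALRM is Unix-only.

import signal

class TimeoutException(Exception):
    """Raised when a fetch runs longer than the alarm interval."""
    pass

def _timeout_handler(signum, frame):
    # Called by the interpreter when a pending signal.alarm(n) expires.
    raise TimeoutException()

# Register once at startup; signal.alarm(10) in run_crawler then interrupts
# a slow get_images() call after roughly 10 seconds.
signal.signal(signal.SIGALRM, _timeout_handler)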
import time

import networkx as nx

# bfs_queue, visited_links, and the fetcher module are initialised elsewhere
# in this script.

G = nx.DiGraph()
G_undir = nx.Graph()
root = "http://www.caltech.edu/"
bfs_queue.put(root)
G.add_node(root)
G_undir.add_node(root)

depth = 0
outcount = {}
while not bfs_queue.empty() and depth < 2001:
    cur_link = bfs_queue.get()
    links = fetcher.fetch_links(cur_link)
    if links is not None:
        # Iterate through the children of the current page
        for link in links:
            if "caltech.edu" in link and link not in visited_links:
                time.sleep(0.5)
                # Update the graph and the visited_links list
                bfs_queue.put(link)
                visited_links.append(link)
                # Record the page along with how many outgoing hyperlinks it has
                try:
                    hyperlinks = fetcher.fetch_links(link)
                    outcount[link] = len(hyperlinks)
                except Exception:
                    continue  # skip pages whose outgoing links cannot be fetched
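The Caltech crawler above depends on fetcher.fetch_links(url) returning the page's outgoing hyperlinks, or None when the page cannot be fetched. A rough stand-in with that contract is sketched below, assuming urllib2 and a naive href regex; the project's actual fetcher module may extract links differently.

import re
import urllib2

def fetch_links(url):
    # Return the absolute hrefs found on the page, or None if the request fails.
    try:
        html = urllib2.urlopen(url, timeout=10).read()
    except Exception:
        return None
    # Crude extraction; a production crawler would use a real HTML parser.
    return re.findall(r'href="(http[^"]+)"', html)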
print current
sys.stdout.write('\r')
# Progress bar: one '=' per 50 pages crawled, percentage out of 2000 pages
sys.stdout.write("[%-40s] %f%%" % ('=' * (count / 50), count / 20))
sys.stdout.write('\r')
sys.stdout.flush()

if current in dictionary_out:
    continue  # already crawled before
G.add_node(current)
try:
    addin = fetcher.fetch_links(current)  # the new outgoing urls
except:
    print "*** Something bad happened (404) ***\n"
    continue
try:
    # Screen the outgoing urls
    for item in addin:
        if "jstor" in item or "ieeexplore" in item or \
           "onlinelibrary" in item or ".pdf" in item:
            continue
        if "caltech.edu" not in item:
            continue
        if item in dictionary_in:
            dictionary_in[item] += 1
        else:
            dictionary_in[item] = 1
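The loop above only accumulates the counts. As a usage sketch that is not part of the original script, the dictionary_in map of incoming-link counts could be ranked once the crawl finishes, for example:

# List the ten most linked-to caltech.edu pages seen during the crawl.
top = sorted(dictionary_in.items(), key=lambda kv: kv[1], reverse=True)
for url, n_in in top[:10]:
    print "%6d  %s" % (n_in, url)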
def crawlerRun(threadID, sleeptime):
    global poolLock, urlPool, urlFound, activeThreads, poolOpen, \
        proxyLock, proxyInd, proxyList, poolLen, foundLen
    # print "Thread " + str(threadID) + " Started"
    sys.stdout.flush()

    MAX_RESULTS = 100000  # How many results? Finite execution, rather than
                          # crawling the entire reachable(URL_seeds) set.
    POOL_LIMIT = 100000
    output = 10           # print progress every `output` pages

    # File to save the crawled structure in
    try:
        f = open('structure.' + str(threadID) + '.dat', 'w')
    except IOError:
        print "Unable to open " + "structure." + str(threadID) + \
            ".dat for writing. Thread " + str(threadID) + " exiting."
        sys.stdout.flush()
        exit()

    nlookups = 150  # start above the threshold so a proxy is picked immediately
    myLink = None
    while foundLen < MAX_RESULTS:
        # Change proxy periodically to avoid the lookup limit
        if nlookups >= 20:
            # print "\t\t\t\tThread " + str(threadID) + " acquiring new proxy."
            sys.stdout.flush()
            proxyLock.acquire()
            myLink = changeURL()
            nlookups = 0
            myIP = myLink[0]
            myPort = myLink[1]
            proxyLock.release()
            # print "\t\t\t\tUsing " + myIP + ":" + str(myPort)

        # Wait until the shared pool has work in it
        while poolLen == 0:
            # print "\t\t\t\tThread " + str(threadID) + " unable to retrieve user from pool." \
            #     + " Pausing for " + str(sleeptime) + " sec."
            # sys.stdout.flush()
            time.sleep(sleeptime)
            poolLen = len(urlPool)

        poolLock.acquire()
        try:
            if len(urlPool) > 0:
                user = urlPool.pop(0)  # fetch next page (FIFO -> breadth first)
            else:
                continue  # another thread drained the pool; go wait again
        finally:
            poolLock.release()  # always release the lock, even on continue

        followers = fetch_links(user, myIP, myPort)
        nlookups += 1
        if followers is None:
            # Profile is busy: absorb it back into the pool and switch proxy
            poolLock.acquire()
            urlPool.insert(0, user)
            poolLock.release()
            proxyLock.acquire()
            myLink = changeURL()
            proxyLock.release()
            # print '\t\t\t\tProfile ' + str(user) + ' is busy. Absorbing back into pool.'
            sys.stdout.flush()
            continue
        try:
            int(followers[0])
        except ValueError:
            # Malformed response: requeue the user and switch proxy
            poolLock.acquire()
            urlPool.append(user)
            poolLock.release()
            # print followers
            proxyLock.acquire()
            myLink = changeURL()
            proxyLock.release()
            continue

        urlFound.append(user)
        foundLen += 1
        if len(followers) > 0 and followers[0] != '':
            # Add unencountered pages to the queue
            new_pages = []
            for ids in followers:
                if not (ids in urlPool or ids in urlFound):
                    new_pages.append(ids)
            writeUser(f, user, followers)
            if poolOpen:
                poolLock.acquire()
                urlPool.extend(new_pages)  # add pages to the queue
                poolLock.release()
        # foundLen = len(urlFound)

        # Print progress
        if (foundLen % output) == 0 and foundLen < MAX_RESULTS:
            poolLen = len(urlPool)
            print "Progress: %d pages crawled. %d users in pool." % (foundLen, poolLen)
            sys.stdout.flush()

        # Close the url pool once the max size is reached; prevents slow-down of the crawl
        if poolOpen:
            poolLen = len(urlPool)
            if poolLen > POOL_LIMIT and poolOpen:
                print "\t\t\t\tMax URL Pool size reached! Closing Pool..."
                sys.stdout.flush()
                poolOpen = False

    f.close()

    # Output results
    print "Thread " + str(threadID) + " Finished! " + \
        str(activeThreads - 1) + " Threads Running."
    sys.stdout.flush()
    activeThreads -= 1
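crawlerRun rotates proxies through changeURL(), which this excerpt does not show. A hypothetical sketch follows, assuming proxyList holds (ip, port) pairs and proxyInd tracks the proxy currently in use; the caller already holds proxyLock, so no locking is done here.

def changeURL():
    # Advance to the next proxy in round-robin order and return its (ip, port).
    global proxyInd
    proxyInd = (proxyInd + 1) % len(proxyList)
    return proxyList[proxyInd]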
urlFound = []

if len(sys.argv[1:]) != 2:
    print usage
    exit()
try:
    seedID = sys.argv[1]
    nThreads = int(sys.argv[2])
except ValueError:
    print usage
    exit()

# Test the seedID before starting any threads
print "Beginning crawl at user ID " + seedID
followers = fetch_links(seedID, proxyList[0][0], proxyList[0][1])  # first proxy's ip and port
# print followers
if followers is None:
    print usage
    print "Unable to open seedID. Twitter may be busy.\n\n"
    exit()
try:
    f = open('structure.' + str(nThreads) + '.dat', 'w')
    writeUser(f, seedID, followers)
    f.close()
except IOError:
    print "Unable to open " + "structure." + str(nThreads) + \
        ".dat for writing. Crawler exiting."
    sys.stdout.flush()
    exit()
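The excerpt stops after the seed user is written out. Below is a sketch of how the main program presumably continues, seeding the shared pool with the seed's followers and launching nThreads workers; the 1-second sleeptime is an illustrative value, not taken from the original source.

import threading

urlPool = list(followers)   # assumed shared work list that crawlerRun pops from
activeThreads = nThreads
threads = []
for tid in range(nThreads):
    t = threading.Thread(target=crawlerRun, args=(tid, 1))
    t.start()
    threads.append(t)
for t in threads:
    t.join()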