Example #1
import os
import stat
import multiprocessing
from multiprocessing import Process, Queue
from threading import Thread
from urlparse import urlparse
import urllib2
import eventlet

from dueunits import DUEUnit
from linkextractor import LinkExtractorTPool #module path assumed; adjust to the project's link-extractor module


class SCSpider(Process):
    """SCSpider: a crawler Process that follows links only within its seed's base URL."""
    Num = 0    
    
    def __init__(self, *sc_queues, **kwargs): 
        Process.__init__(self)
        SCSpider.Num += 1
        self.pnum = SCSpider.Num 
        if sc_queues:
            self.scqs = sc_queues
        else:
            self.scqs = list() 
        self.due = DUEUnit()
        self.link_extro = LinkExtractorTPool(feed=False)
        #self.headers holds the HTTP User-Agent header used to mask the crawler
        self.headers = { 'User-Agent' : kwargs.pop("spider_spoof_id", None) }
        if self.headers['User-Agent'] is None:
            self.headers = { 'User-Agent' : 'Mozilla/5.0 (X11; U; Linux x86_64; en-GB; rv:1.9.1.9)' }
        #Note: Event().clear() returns None, so default to a fresh (unset) Event instead
        self.kill_evt = kwargs.pop("kill_evt", multiprocessing.Event())
        self.urls_l = [ kwargs.pop("seed", None) ] 
        self.xtrees_q = kwargs.pop("xtrees_q", Queue()) #Pass an external Queue here only when inter-process communication is needed
        #ext_due_q is a Queue of URL Links for an External DUE-Unit 
        self.ext_url_q = kwargs.pop("ext_due_q", None)
        self.base_url_drop_none = kwargs.pop("base_url_drop_none", True)
        #urls_number_stop: default page limit (when the user gives none), both for politeness and because more samples of the same site add little value
        self.urls_number = kwargs.pop("urls_number_stop", 1000)  
        self.webpg_vect_tu = kwargs.pop("webpg_vect_tu", None)
        self.save_path = kwargs.pop("save_path", None)
        if self.save_path and not os.path.isdir(self.save_path):
            os.mkdir(self.save_path)
        self.file_counter = 0
        
    def run(self):
        """SCSpider's main function"""
        #Use the netloc (network locator) of the seed url as the base URL that this spider is working on
        url = self.urls_l[0]       
        self.due.setBase(url)
        #Green thread pools: one for fetching pages, one for saving them to disk
        n_gpool = eventlet.GreenPool(1)
        s_gpool = eventlet.GreenPool(10000)
        #ppool = multiprocessing.Pool(2)
        #green_save_p = eventlet.GreenPool(1000)
        #A background thread flushes the DUE 'seen' dictionary to disk once it grows large enough
        disk_keeper_thrd = Thread(target=self.savedue)
        disk_keeper_thrd.start()
        #Main crawling loop of this Process starts here
        scanned_urls = 1 #Counter of URLs the crawler has followed so far (must be initialised before the loop)
        while True:
            #Termination conditions for this spider: an external signal, a user-defined limit, or no URLs left that match the crawl target
            #Terminate - an external signal or a functional problem occurred while crawling
            if self.kill_evt.is_set():
                print("SCSpider Process (PID = %s - PCN = %s): Terminated" % (self.pid, self.pnum))
                SCSpider.Num -= 1
                break
            #Terminate - No more Urls to follow
            if self.urls_l == []:
                print("SCSpider Process (PID = %s - PCN = %s): Terminated - No more URL links to follow" % (self.pid, self.pnum))
                SCSpider.Num -= 1
                self.due.savetofile()
                self.kill_evt.set()
                break
            #Fetch a URL that another SCSpider has left on ext_due_q and append it to urls_l
            if self.ext_url_q is not None:
                ext_url = self.ext_url_q.get(self.due.base_url['hashkey'], 2)
                if ext_url:
                    self.urls_l.append(ext_url)
            tmp_urls_l = list() #Fresh list for the links discovered in this round (must be reset here, inside the loop)
            #Process the web pages (in fact the sockets to them) that a pool of green threads harvests asynchronously
            for xhtml in n_gpool.imap(self.fetchsrc, self.urls_l):
            #for xhtml in ppool.imap_unordered(fetchsrc, self.urls_l, 2):
                #Feed the link extractor with the newly fetched XHTML, if it is not empty
                if xhtml[0]:
                    #Extend the xhtml dictionary with some data for later processing
                    xhtml_d = dict()
                    xhtml_d['xhtml_s'] = xhtml[0]       
                    xhtml_d['charset'] = xhtml[1]
                    xhtml_d['url_req'] = xhtml[2]
                    xhtml_d['url_resp'] = xhtml[3]
                    prsd_url = urlparse(xhtml[2])
                    xhtml_d['netloc'] = prsd_url.netloc
                    xhtml_d['base_url'] = str(prsd_url.scheme + "://" + prsd_url.netloc) 
                    s_gpool.spawn_n(self.save_xhtml, xhtml_d)
                    #Terminate - Some User condition reached
                    if scanned_urls >= self.urls_number:
                        print("SCSpider Process (PID = %s - PCN = %s): Terminated - User Condition: Stop On %d Pages (%d have been followed) "\
                              % (self.pid, self.pnum, self.urls_number, scanned_urls))
                        SCSpider.Num -= 1
                        self.due.savetofile()
                        self.kill_evt.set()
                        break
                    else:
                        print("SAVED URLS: %d" % scanned_urls)
                        scanned_urls += 1
                    #Extract URL for next Sites
                    ##Feed LinkExtractor now - get later    
                    #self.link_extro.feed(xhtml_d)
                    ##Force LinkExtractor to return immediately     
                    for link in self.link_extro.sites_links(xhtml_d):
                        parsed_u = urlparse(link)
                        prsd_url = str(parsed_u.scheme + "://" + parsed_u.netloc)
                        baseurl = str(self.due.base_url['scheme'] + "://" +  self.due.base_url['netloc'])
                        if  prsd_url == baseurl:
                            seen = self.ust(link)
                            if not seen:                   
                                tmp_urls_l.append(link)
                            #else: discard this previously seen URL link
                            else:
                                pass
                else:
                    print("SPIDER %d of %d BASE %s" % (self.pnum, SCSpider.Num, self.due.base_url['netloc']))
                    print("Empty Page : %s" % xhtml[1])
                    
            #Alternative: get the link lists from the linkextractor.sites_links_iter() iterator - in case feed() or l_feed() have been used
            #for links in self.link_extro.sites_links_iter():
            #    for link in links:
            #        parsed_u = urlparse(link)
            #        prsd_url = str(parsed_u.scheme + "://" + parsed_u.netloc)
            #        baseurl = str(self.due.base_url['scheme'] + "://" +  self.due.base_url['netloc'])
            #        if prsd_url == baseurl:
            #            seen = self.ust(link)
            #            if not seen:
            #                print("Not SEEN!")
            #                tmp_urls_l.append(link)
            #            #else: discard this previously seen URL link
            #            else:
            #                pass
            #        else:
            #            #If the base URL is not the one this SCSpider is working on, try to pass the link to the other SCSpider Processes
            #            if self.ext_url_q != None:
            #                self.ext_url_q.put(link)
            #            elif self.base_url_drop_none:
            #                seen = self.ust(link)
            #                if seen:
            #                    tmp_urls_l.append(link)
            #Now hand the new URL list back to the fetcher green threads
            del self.urls_l #delete the old list explicitly to limit memory growth
            self.urls_l = tmp_urls_l
            print(len(self.urls_l))
        #Before this Process terminates, wait for the disk_keeper thread to finish its job and join it
        self.due.acquire()
        #WAKE-UP disk_keeper thread in case still waiting
        self.due.notify_all()
        self.due.release()
        #self.link_extro.close()
        disk_keeper_thrd.join()
        
    def fetch(self, url):
        #print("IN FETCH: " + str(self.headers))
        #print(url)
        rq = urllib2.Request(url, headers=self.headers)
        socket = urllib2.urlopen(rq)
        return socket
    
    def fetchsrc(self, url_req):
        #time.sleep(5)
        htmlsrc = None
        socket = None
        charset = None
        url_resp = None
        try:
            rq = urllib2.Request(url_req, headers=self.headers)
            socket = urllib2.urlopen(rq)
            htmlsrc = socket.read()
            charset = socket.info().getparam('charset')
            url_resp = socket.geturl()
            socket.close()
        except Exception as e:
            print("FETCH ERROR(urllib2): %s - URL: %s" % (e, url_req))
        #Return a tuple of the HTML source, its character encoding, the requested URL and the final response URL
        return (htmlsrc, charset, url_req, url_resp)
    
    def save_xhtml(self, xhtml_d):
        assert os.path.isdir(self.save_path)
        ret_signal = True
        fenc = None #Initialised so the finally clause can close it safely even if opening the file fails
        self.file_counter += 1
        try:
            try:
                fpath = os.path.join(self.save_path, "%s.%d.html" % (xhtml_d['netloc'], self.file_counter))
                f = os.open(fpath, os.O_CREAT | os.O_WRONLY, stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH | stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
            except Exception as e:
                #Stop the Process First and then raise the Exception with some extra info
                self.kill_evt.set()
                raise Exception("SCSpider error while Creating file - Error: %s" % e)
            #Place a file-object wrapper around the file Descriptor
            fobj = os.fdopen(f, "w", 1)       
            #Place an Encoding Wrapper to assure the file Writing to be performed with UTF-8 encoding
            #if xhtml_d['charset']:
            #    fenc = codecs.EncodedFile(fobj, xhtml_d['charset'])
            #else:
            fenc = fobj
        except Exception as e:
            print("SCSpider error while Creating file - Error: %s" % e)
            #Return None so the spider knows an error occurred and can decide what to do about it
            ret_signal = None 
        else:
            try:
                fenc.write( xhtml_d['xhtml_s'] ) # Write the source to file
            except Exception as e:
                print("SCSpider Error while Writing file - Error: %s" % e)
                ret_signal = None
        finally:
            if fenc is not None:
                fenc.close()
            
        return ret_signal
    
    def savedue(self):
        while not self.kill_evt.is_set():
            self.due.acquire()
            while self.due.seen_len() < 100000000:
                #Stop this thread too when a global kill signal is given; release the lock before returning
                if self.kill_evt.is_set():
                    self.due.release()
                    return
                #The 'seen' dictionary is still small, so wait (i.e. sleep) until notified
                self.due.wait()
            if not self.due.savetofile():
                print("SCSPIDER: DUE FILE NOT SAVED - HALT")
                self.kill_evt.set()
                self.due.release()
                return
            #self.due.notify_all()
            self.due.release()
            
    def ust(self, link):
        self.due.acquire()
        seen = self.due.ust(link)
        self.due.notify_all()
        self.due.release()
        return seen
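
Since SCSpider subclasses multiprocessing.Process, a caller is expected to construct it with a seed URL plus keyword options and then start() it. The sketch below is a minimal, hypothetical launch, assuming only the keyword arguments shown in the constructor above; the urls_number_stop and save_path values are illustrative, and the positional sc_queues and any xtrees_q consumer are omitted.

import multiprocessing

if __name__ == "__main__":
    kill_evt = multiprocessing.Event()              #run() sets this event itself when the spider stops
    spider = SCSpider(seed="http://www.bbc.co.uk",
                      kill_evt=kill_evt,
                      urls_number_stop=100,         #illustrative page limit (politeness)
                      save_path="/tmp/scspider")    #illustrative output directory, created if missing
    spider.start()
    spider.join()
    print("Spider finished, kill event set: %s" % kill_evt.is_set())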
Example #2
from dueunits import DUEUnit

due = DUEUnit()

due.setBase("http://www.unit-test.org")

News_Seeds = [
    "http://www.bbc.co.uk",
    "http://edition.cnn.com",
    "http://www.bloomberg.com",
    "http://www.ted.com/talks/tags",
    "http://www.foxnews.com",
    "http://www.time.com/time",
    "http://www.nationalgeographic.com",
    "http://www.bbcfocusmagazine.com",
    "http://www.pcmag.com",
    "http://www.drdobbs.com",
    "http://news.google.com",
    "http://www.channelnewsasia.com",
    "http://health.usnews.com",
    "http://www.zdnet.co.uk",
    "http://soccernet.espn.go.com",
    "http://mlb.mlb.com",
    "http://www.dallasnews.com",
    "http://www.theaustralian.com.au",
    "http://www.nydailynews.com",
]

due.ust(News_Seeds)

# Seed = "http://www.bbc.co.uk"
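
If these seeds were fed to the SCSpider class from the example above (assuming it is importable here), one illustrative wiring is one SCSpider process per seed. Each spider gets its own kill event, because run() sets its event when the spider stops, so a single shared event would stop every spider as soon as the first one finished.

import multiprocessing

spiders = []
for seed in News_Seeds:
    evt = multiprocessing.Event()                   #one event per spider; run() sets it on termination
    sp = SCSpider(seed=seed,
                  kill_evt=evt,
                  urls_number_stop=50,              #illustrative per-site page limit
                  save_path="/tmp/scspider")        #illustrative output directory
    sp.start()
    spiders.append(sp)

for sp in spiders:
    sp.join()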