Example 1
 def __processBaseURL(self):
     '''
     Private member function used to process the base URL.

     Processes the base URL, extracts the links it contains, and creates a URLLinks object for each
     extracted link. Each of these objects is pushed as a child entry of the main URLLinks object
     created for the base URL.

     These child objects are later accessed in a loop and processed further to check their validity,
     depending on the depth to which they belong.
     '''
     ts = time.time()
     handle = self.__getDataFromURL(self.__dict__['_url'])
     ted = time.time()
     dlTime = ted - ts
     if ( self.__checkIfError(handle)):
         if ( handle[0] == 'HTTPError'):
             eCode = ErrorCodes(int(handle[1]))
             einfo = eCode.getError()[1]
         else:
             einfo = handle[1]
         urlObject = URLLinks(self.__dict__['_url'], None, self.__dict__['_url'], None, isProcessed=True, isBroken=True, 
                              size='<Unknown>', dlTime=dlTime, checkTime=dlTime, lastModified='<Unknown>', info=einfo, status=handle[0] + ' : ' + handle[1], lType='<Unknown>')
         self.__printError(handle[0] + ' : ' + handle[1] + ' : ' + einfo)
         self.__raiseError(handle, self.__dict__['_url'])
         return urlObject
     else:
         ts = time.time()
         htmlData = urllib2.urlopen(self.__dict__['_url'])
         ted = time.time()
         data = etree.HTML(htmlData.read())
         dlTime = ted - ts
         title = self.__getURLTitle(data)
         links = self.__links(data)
         (lType, lastChanged, size) = self.__getURLInfo(handle)
         status = 'OK'
         urlObj = URLLinks(self.__dict__['_url'], title, self.__dict__['_url'], title, isProcessed=True, isBroken=False, size=size, dlTime=dlTime,
                           lastModified=lastChanged, info='Successfully Processed', status=status, lType=lType)
         
         for link in links:
             cLink = str(link.attrib['href']).strip()
             if ( cLink.startswith(('#', '.')) or self.__dict__['_url'] not in cLink):
                 cLink = urlparse.urljoin(self.__dict__['_url'], cLink)
             
             if ( self.__dict__['_url'] in cLink):
                 cTitle = link.text
                 temp = URLLinks(self.__dict__['_url'], title, cLink, cTitle)
                 urlObj.addChild(temp)
         te = time.time()
         cTime = te - ts
         urlObj.setCheckTime(cTime)
         Deadcheck.__levelBasedLinks[0] = []
         Deadcheck.__levelBasedLinks[0].append(urlObj)
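
The loop above normalizes each extracted href with the standard urlparse module: fragments, relative paths, and anything that does not already contain the base URL are resolved against the base URL before the same-site check. A minimal standalone sketch of that step, using a hypothetical base URL and assuming Python 2 (matching the urllib2/urlparse usage in these examples):

    import urlparse

    base = 'http://example.com/docs/index.html'      # hypothetical base URL
    for href in ('#intro', './guide.html', '../faq.html', 'http://other.org/x'):
        # Relative references are resolved against the base; absolute URLs pass through.
        print href, '->', urlparse.urljoin(base, href)

    # '#intro'             -> http://example.com/docs/index.html#intro
    # './guide.html'       -> http://example.com/docs/guide.html
    # '../faq.html'        -> http://example.com/faq.html
    # 'http://other.org/x' -> http://other.org/x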
Example 2
 def __analyze(self, url):
     ts = time.time()
     handle = self.__getDataFromURL(url)
     ted = time.time()
     dlTime = ted - ts
     if ( self.__checkIfError(handle)):
         if ( handle[0] == 'HTTPError'):
             eCode = ErrorCodes(int(handle[1]))
             einfo = eCode.getError()[1]
         else:
             einfo = handle[1]
         urlObject = URLLinks(url, None, url, None, isProcessed=True, isBroken=True, 
                              size='<Unknown>', dlTime=dlTime, checkTime=dlTime, lastModified='<Unknown>', info=einfo, status=handle[0] + ' : ' + handle[1], lType='<Unknown>')
         return urlObject
     else:
         ts = time.time()
         htmlData = urllib2.urlopen(url)
         ted = time.time()
         data = etree.HTML(htmlData.read())
         dlTime = ted - ts
         title = self.__getURLTitle(data)
         links = self.__links(data)
         (lType, lastChanged, size) = self.__getURLInfo(handle)
         status = 'OK'
         urlObj = URLLinks(url, title, url, title, isProcessed=True, isBroken=False, size=size, dlTime=dlTime,
                           lastModified=lastChanged, info='Successfully Processed', status=status, lType=lType)
         for link in links:
             cLink = str(link.attrib['href']).strip()
             if ( cLink.startswith(('#', '.')) or url not in cLink):
                 cLink = urlparse.urljoin(url, cLink)
             
             if ( urlparse.urlparse(url).netloc in cLink):
                 cTitle = link.text
                 temp = URLLinks(url, title, cLink, cTitle)
                 urlObj.addChild(temp)
         te = time.time()
         cTime = te - ts
         urlObj.setCheckTime(cTime)
         
         return urlObj
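
__analyze keeps a child link only when the netloc (host) of the URL being analyzed appears somewhere in the resolved link. A short sketch of that filter with hypothetical URLs, again assuming only the standard urlparse module:

    import urlparse

    page = 'http://example.com/blog/post.html'       # hypothetical page URL
    netloc = urlparse.urlparse(page).netloc          # 'example.com'
    candidates = ['http://example.com/about.html', 'http://cdn.other.net/img.png']
    # Keep only links that mention the page's host.
    internal = [link for link in candidates if netloc in link]
    print internal                                   # ['http://example.com/about.html']

Note that a plain substring test is a loose check (for example, 'http://example.com.evil.net/' would also pass), so comparing parsed netlocs directly would be a stricter alternative.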
Example 3
 def process(self):
     '''
     Method that will be called on the Deadcheck object from the main script to instruct the
     module to process the links based on the depth to which they belong.

     Each link is extracted from the child URL list that belongs to the parent URLLinks object and
     processed after checking for exemptions.

     Based on the processing, the appropriate parameters and values are set using the set methods available in the
     URLLinks class.

     Each page being processed has its own list of child URLs, which are extracted and pushed into an array.

     These lists of URLs are processed at the next depth / level.
     '''
     self.__loadExempt()
     if ( self.get_depth() == 0 ):
         self.__analyze() 
     else:
         for level in range(self.get_depth()+1):
             Deadcheck.__levelBasedLinks[level+1] = []
             for vobj in self.getAll()[level]:
                 for obj in vobj.getChildren():
                     t1 = time.time()
                     (url, title) = obj.get()
                     #if ( not Deadcheck.__ProcessedLinks.has_key(url) and not self.__checkExempt(url) and 'javascript' not in url.lower()):
                     if ( url not in Deadcheck.__ProcessedLinks and not self.__checkExempt(url) ):
                         Deadcheck.__ProcessedLinks[url] = 1
                         # Process javascript:openWindow style URLs to extract the necessary links.
                         self.__printMessage("Processing Link : " + url)
                         if ( 'javascript' in url.lower()):
                             url = self.__cleanupJavaScript(url)
                             
                         ts = time.time()
                         handle = self.__getDataFromURL(url)
                         ted = time.time()
                         if ( self.__checkIfError(handle)):
                             if ( handle[0] == 'HTTPError'):
                                 eCode = ErrorCodes(int(handle[1]))
                                 einfo = eCode.getError()[1]
                             else:
                                 einfo = handle[1]
                             obj.setInfo(einfo)
                             obj.setProcessed(True)
                             obj.setBroken(True)
                             obj.setStatus(handle[0] + ' : ' + str(handle[1]))
                             obj.setDLTime(ted-ts)
                             obj.setSize('<Unknown>')
                             obj.setLastModified('<Unknown>')
                             obj.setType('<Unknown>')
                             obj.setCheckTime(ted-ts)
                             
                             print 'Broken ' + str(obj.get())
                             self.__printError('Broken Link ' + str(obj.get()))
                         else:
                             ts = time.time()
                             htmlData = urllib2.urlopen(url)
                             ted = time.time()
                             data = etree.HTML(htmlData.read())
                             dlTime = ted - ts
                             title = self.__getURLTitle(data)
                             links = self.__links(data)
                             (lType, lastChanged, size) = self.__getURLInfo(htmlData)
                             status = 'OK'
                             urlObj = URLLinks(url, title, url, title, isProcessed=True, isBroken=False, size=size, dlTime=dlTime, lastModified=lastChanged,
                                               info='Successfully Processed', status=status, lType=lType)
                             
                             for link in links:
                                 cLink = str(link.attrib['href']).strip()
                                 if ( cLink.startswith(('#', '.')) or url not in cLink):
                                     cLink = urlparse.urljoin(url, cLink)
                                 
                                 if ( urlparse.urlparse(self.__dict__['_url']).netloc in cLink):
                                     cTitle = link.text
                                     temp = URLLinks(url, title, cLink, cTitle, status='UNPROCESSED')
                                     urlObj.addChild(temp)
                             te = time.time()
                             cTime = te - ts
                             urlObj.setCheckTime(cTime)
                             Deadcheck.__levelBasedLinks[level+1].append(urlObj)
                             t2 = time.time()
                             obj.setInfo('Successfully Processed.')
                             obj.setProcessed(True)
                             obj.setBroken(False)
                             obj.setStatus('OK')
                             obj.setDLTime(dlTime)
                             obj.setSize(size)
                             obj.setLastModified(lastChanged)
                             obj.setType(lType)
                             obj.setCheckTime(t2-t1)
                     else:
                         if ( self.__checkExempt(url)):
                             obj.setInfo('Exempted based on the Input file : ' + self.__dict__['_exempt'])
                             obj.setStatus('EXEMPTED')
                             self.__printWarning("URL Exempted : " + url)
                         elif ( 'javascript' in url ):
                             obj.setInfo('Javascript Links are not processed. Implementation underway.')
                             obj.setStatus('WARNING')
                         else:
                             obj.setInfo('URL Already Processed. Will not be processed again.')
                             obj.setStatus('SKIPPED')
                             self.__printWarning("Skipping URL : " + url)
                         obj.setProcessed(True)
                         obj.setBroken(False)
                         obj.setDLTime(None)
                         obj.setSize(None)
                         obj.setLastModified(None)
                         obj.setType(None)
                         obj.setCheckTime(None)
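
Putting the three methods together, a driver script would typically build a Deadcheck instance, call process(), and then walk the level-based results. The sketch below is illustrative only: the module name and constructor arguments are assumptions, while getAll(), getChildren() and get() are accessors that appear in the examples above.

    from deadcheck import Deadcheck                  # assumed module and class name

    # Assumed constructor signature; the real one may differ.
    checker = Deadcheck(url='http://example.com/', depth=2)
    checker.process()                                # process links level by level

    # getAll() is assumed to return the level -> [URLLinks] mapping
    # maintained in Deadcheck.__levelBasedLinks.
    for level, pages in checker.getAll().items():
        for page in pages:
            for child in page.getChildren():
                url, title = child.get()
                print level, url, title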