# Module-level imports implied by this method; cssselect() additionally
# requires the 'cssselect' package to be installed alongside lxml.
import urllib.error
from urllib.request import urlopen
from lxml.html import parse

def fetchLinks(self, url):
    dom = None
    try:
        response = urlopen(url)
        contenttype = response.info()['Content-Type'].strip().lower()
        # If the Content-Type is not text/html, reject the URL: it could
        # point to an image or something else that cannot be parsed.
        if not contenttype.startswith('text/html'):
            self.log.debug('rejected = ' + url + ' Content-Type = ' + contenttype)
            return
        dom = parse(response).getroot()
    except urllib.error.HTTPError as e1:
        self.log.warning('HTTP error for ' + url)
        self.log.debug(e1, exc_info=True)
        return
    except Exception as e2:
        self.log.warning('cannot parse ' + url)
        self.log.debug(e2, exc_info=True)
        return
    if dom is not None:
        linksList = []
        # cssselect() always returns a list (possibly empty), so no None check is needed.
        for link in dom.cssselect('a'):
            href = link.get('href')
            if href is None:
                continue
            suburl = URLValidator.formValidURL(url, href)
            if suburl is not None:
                linksList.append(suburl)
            else:
                self.log.debug('rejected href = ' + href)
        # preserve the original contract: None when no links were extracted
        return linksList or None
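As a rough usage sketch, fetchLinks can be attached to a minimal host class that provides the self.log attribute it expects. The Crawler class and logging setup below are assumptions made for illustration (the original class definition is not shown in this section), and the call also relies on a URLValidator implementation such as the hypothetical one sketched at the end of this section.

import logging

class Crawler:
    # Hypothetical host class: fetchLinks only needs a 'log' attribute on self.
    fetchLinks = fetchLinks  # attach the function defined above as a method

    def __init__(self):
        self.log = logging.getLogger('crawler')

logging.basicConfig(level=logging.DEBUG)
links = Crawler().fetchLinks('http://python.org')
for link in links or []:  # fetchLinks returns None when nothing was extracted
    print(link)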
# Fragment from Main.py; assumes 'import sys', the URLValidator helper, and
# the lock-file path 'lock' are defined earlier in the script.
try:
    f = open(lock, 'w+')
    f.close()
except OSError:
    pass  # even if the lock file cannot be created, proceed with the rest of the code

log = None
url = None
maxlinks = None

cmdlength = len(sys.argv)
if cmdlength < 2:
    print("Please enter the URL to crawl and, optionally, the maximum number of links to extract")
    print("Eg 1: python Main.py http://python.org 50")
    print("Eg 2: python Main.py http://python.org")
    sys.exit()
elif cmdlength == 2 or cmdlength == 3:
    url = sys.argv[1]
    if not URLValidator.isValidURL(url):
        print("Either your URL is not valid or it is in a format that cannot be crawled")
        sys.exit()
    if cmdlength == 3:
        try:
            maxlinks = int(sys.argv[2])
        except ValueError:
            print("Invalid maximum links")
            sys.exit()
        if maxlinks < 1:
            print("maximum links must be at least 1")
            sys.exit()
else:
    print("Invalid number of arguments")
    sys.exit()

try:
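The trailing try: above opens the crawl logic that follows in the original file. For reference, below is a hypothetical sketch of the two URLValidator helpers both blocks call; the real implementation is not shown in this section, so the validation rules here (absolute http/https URLs only, fragments stripped) are assumptions rather than the author's actual logic.

import urllib.parse

class URLValidator:
    # Hypothetical reconstruction of the helper used above.

    @staticmethod
    def isValidURL(url):
        # Accept only absolute http(s) URLs that have a host component.
        parts = urllib.parse.urlparse(url)
        return parts.scheme in ('http', 'https') and bool(parts.netloc)

    @staticmethod
    def formValidURL(baseurl, href):
        # Resolve a possibly relative href against the page it came from,
        # drop any fragment, and return None for anything not crawlable.
        absolute, _fragment = urllib.parse.urldefrag(urllib.parse.urljoin(baseurl, href))
        return absolute if URLValidator.isValidURL(absolute) else None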