def getURLInfo(self, url=None):
    '''
    Fetch *url* and build a URLInfo describing the resource.

    @see: IURLInfoService.getURLInfo

    @param url: str
        The (possibly percent-encoded) URL to inspect.
    @return: URLInfo
        For 'text/html' responses the info is populated by HTMLInfoExtractor;
        for anything else only URL, Date and a Title derived from the last
        path segment (or the host) are set.
    @raise InputError: if the URL is missing, malformed or unreachable.
    '''
    if not url: raise InputError('Invalid URL %s' % url)
    assert isinstance(url, str), 'Invalid URL %s' % url
    url = unquote(url)
    try:
        with urlopen(url) as conn:
            urlInfo = URLInfo()
            urlInfo.URL = url
            urlInfo.Date = datetime.now()

            # The response headers behave like email.message.Message; use its
            # parsers instead of splitting the raw 'Content-Type' header by
            # hand — this is case-insensitive, already lower-cased, and copes
            # with extra parameters and quoted charset values.
            headers = conn.info()
            contentType = headers.get_content_type()
            charset = headers.get_content_charset('utf_8')

            if contentType != 'text/html':
                # Not parseable as HTML: derive a title from the last path
                # segment, falling back to the host name for bare URLs.
                # NOTE(review): get_selector()/get_host() are the legacy
                # urllib API this file already relies on — kept for behavior.
                req = Request(url)
                selector = req.get_selector().strip('/')
                if selector: urlInfo.Title = selector.split('/')[-1]
                else: urlInfo.Title = req.get_host()
                return urlInfo

            urlInfo.ContentType = contentType
            extr = HTMLInfoExtractor(urlInfo)
            try:
                # Decode with the declared charset; 'ignore' keeps a stray bad
                # byte sequence from discarding the whole page. LookupError
                # covers an unknown/garbled charset name in the header.
                extr.feed(conn.read().decode(charset, 'ignore'))
            except (AssertionError, HTMLParseError, UnicodeDecodeError, LookupError): pass
            return extr.urlInfo
    except (URLError, ValueError):
        raise InputError('Invalid URL %s' % url)
def getURLInfo(self, url=None):
    '''
    Fetch *url* and build a URLInfo describing the resource.

    @see: IURLInfoService.getURLInfo

    @param url: str
        The (possibly percent-encoded) URL to inspect.
    @return: URLInfo
        For 'text/html' responses the content is decoded with the declared
        charset, patched by the self.html_fixes regex pairs and fed to
        HTMLInfoExtractor; for anything else only URL, Date and a Title
        derived from the last path segment (or the host) are set.
    @raise InputError: if the URL is missing, malformed or unreachable.
    '''
    if not url: raise InputError('Invalid URL %s' % url)
    assert isinstance(url, str), 'Invalid URL %s' % url
    url = unquote(url)
    try:
        with urlopen(url) as conn:
            urlInfo = URLInfo()
            urlInfo.URL = url
            urlInfo.Date = datetime.now()

            # The response headers behave like email.message.Message; its
            # parsers replace the hand-rolled split that raised IndexError on
            # 'text/html;' and missed the charset whenever the header carried
            # more than one parameter or spaces/quotes around the value.
            headers = conn.info()
            contentType = headers.get_content_type()
            charset = headers.get_content_charset('utf_8')

            if contentType != 'text/html':
                # Not parseable as HTML: derive a title from the last path
                # segment, falling back to the host name for bare URLs.
                # NOTE(review): get_selector()/get_host() are the legacy
                # urllib API this file already relies on — kept for behavior.
                req = Request(url)
                selector = req.get_selector().strip('/')
                if selector: urlInfo.Title = selector.split('/')[-1]
                else: urlInfo.Title = req.get_host()
                return urlInfo

            urlInfo.ContentType = contentType
            extr = HTMLInfoExtractor(urlInfo)
            try:
                readData = conn.read()
                try:
                    # 'ignore' keeps a stray bad byte from discarding the page.
                    decodedData = readData.decode(charset, 'ignore')
                except LookupError:
                    # Unknown/garbled charset declaration: fall back to UTF-8.
                    decodedData = readData.decode('utf_8', 'ignore')
                # Apply the configured source-level HTML repairs before parsing.
                for onePair in self.html_fixes:
                    decodedData = re.sub(onePair['from'], onePair['to'], decodedData)
                extr.feed(decodedData)
            except (AssertionError, HTMLParseError, UnicodeDecodeError): pass
            return extr.urlInfo
    except (URLError, ValueError):
        raise InputError('Invalid URL %s' % url)
def getURLInfo(self, url=None):
    '''
    Fetch *url* and build a URLInfo describing the resource.

    @see: IURLInfoService.getURLInfo

    @param url: str
        The (possibly percent-encoded) URL to inspect.
    @return: URLInfo
        For 'text/html' responses the content is decoded with the declared
        charset, patched by the self.html_fixes regex pairs and fed to
        HTMLInfoExtractor; for anything else only URL, Date and a Title
        derived from the last path segment (or the host) are set.
    @raise InputError: if the URL is missing, malformed or unreachable.
    '''
    if not url: raise InputError('Invalid URL %s' % url)
    assert isinstance(url, str), 'Invalid URL %s' % url
    url = unquote(url)
    try:
        with urlopen(url) as conn:
            urlInfo = URLInfo()
            urlInfo.URL = url
            urlInfo.Date = datetime.now()

            # The response headers behave like email.message.Message; its
            # parsers replace the hand-rolled split that raised IndexError on
            # 'text/html;' and missed the charset whenever the header carried
            # more than one parameter or spaces/quotes around the value.
            headers = conn.info()
            contentType = headers.get_content_type()
            charset = headers.get_content_charset('utf_8')

            if contentType != 'text/html':
                # Not parseable as HTML: derive a title from the last path
                # segment, falling back to the host name for bare URLs.
                # NOTE(review): get_selector()/get_host() are the legacy
                # urllib API this file already relies on — kept for behavior.
                req = Request(url)
                selector = req.get_selector().strip('/')
                if selector: urlInfo.Title = selector.split('/')[-1]
                else: urlInfo.Title = req.get_host()
                return urlInfo

            urlInfo.ContentType = contentType
            extr = HTMLInfoExtractor(urlInfo)
            try:
                readData = conn.read()
                try:
                    # 'ignore' keeps a stray bad byte from discarding the page.
                    decodedData = readData.decode(charset, 'ignore')
                except LookupError:
                    # Unknown/garbled charset declaration: fall back to UTF-8.
                    decodedData = readData.decode('utf_8', 'ignore')
                # Apply the configured source-level HTML repairs before parsing.
                for onePair in self.html_fixes:
                    decodedData = re.sub(onePair['from'], onePair['to'], decodedData)
                extr.feed(decodedData)
            except (AssertionError, HTMLParseError, UnicodeDecodeError): pass
            return extr.urlInfo
    except (URLError, ValueError):
        raise InputError('Invalid URL %s' % url)
def getURLInfo(self, url=None):
    '''
    Fetch *url* and build a URLInfo describing the resource.

    @see: IURLInfoService.getURLInfo

    @param url: str
        The (possibly percent-encoded) URL to inspect.
    @return: URLInfo
        For 'text/html' responses the info is populated by HTMLInfoExtractor;
        for anything else only URL, Date and a Title derived from the last
        path segment (or the host) are set.
    @raise InputError: if the URL is missing, malformed or unreachable.
    '''
    if not url: raise InputError('Invalid URL %s' % url)
    assert isinstance(url, str), 'Invalid URL %s' % url
    url = unquote(url)
    try:
        with urlopen(url) as conn:
            urlInfo = URLInfo()
            urlInfo.URL = url
            urlInfo.Date = datetime.now()

            # The response headers behave like email.message.Message; use its
            # parsers instead of splitting the raw 'Content-Type' header by
            # hand — this is case-insensitive, already lower-cased, and copes
            # with extra parameters and quoted charset values.
            headers = conn.info()
            contentType = headers.get_content_type()
            charset = headers.get_content_charset('utf_8')

            if contentType != 'text/html':
                # Not parseable as HTML: derive a title from the last path
                # segment, falling back to the host name for bare URLs.
                # NOTE(review): get_selector()/get_host() are the legacy
                # urllib API this file already relies on — kept for behavior.
                req = Request(url)
                selector = req.get_selector().strip('/')
                if selector: urlInfo.Title = selector.split('/')[-1]
                else: urlInfo.Title = req.get_host()
                return urlInfo

            urlInfo.ContentType = contentType
            extr = HTMLInfoExtractor(urlInfo)
            try:
                # Decode with the declared charset; 'ignore' keeps a stray bad
                # byte sequence from discarding the whole page. LookupError
                # covers an unknown/garbled charset name in the header.
                extr.feed(conn.read().decode(charset, 'ignore'))
            except (AssertionError, HTMLParseError, UnicodeDecodeError, LookupError): pass
            return extr.urlInfo
    except (URLError, ValueError):
        raise InputError('Invalid URL %s' % url)