Exemple #1
0
    def getURLInfo(self, url=None):
        '''
        @see: IURLInfoService.getURLInfo

        Opens the provided URL and describes it in a URLInfo object that has the URL and the current date set.
        For content other than 'text/html' only the title is filled in, using the last path segment of the URL or
        the host if the URL has no path. For 'text/html' the content type is recorded and the page is fed to a
        HTMLInfoExtractor on a best effort basis, parsing problems being ignored.

        @param url: string
            The URL to get the info for, it is unquoted before being opened.
        @return: URLInfo
            The info object populated for the URL.
        @raise InputError: if no URL is provided or if handling it raises URLError or ValueError.
        '''
        if not url: raise InputError('Invalid URL %s' % url)
        assert isinstance(url, str), 'Invalid URL %s' % url
        url = unquote(url)

        try:
            with urlopen(url) as conn:
                urlInfo = URLInfo()
                urlInfo.URL = url
                urlInfo.Date = datetime.now()

                # The message API matches the header name case insensitively and strips the parameters; it reports
                # 'text/plain' when the header is missing, so that case also lands on the non HTML branch.
                headers = conn.info()
                if headers.get_content_type() != 'text/html':
                    # The 'selector' and 'host' attributes are used instead of get_selector()/get_host(), which
                    # are no longer available on urllib.request.Request starting with Python 3.4.
                    req = Request(url)
                    selector = req.selector.strip('/')
                    if selector: urlInfo.Title = selector.split('/')[-1]
                    else: urlInfo.Title = req.host
                    return urlInfo

                urlInfo.ContentType = 'text/html'
                extr = HTMLInfoExtractor(urlInfo)
                try:
                    content = conn.read()
                    # Honor the charset declared by the server, a strict UTF-8 decode would discard every page
                    # served in another encoding.
                    charset = headers.get_content_charset() or 'utf_8'
                    try: text = content.decode(charset, 'ignore')
                    except (LookupError, ValueError): text = content.decode('utf_8', 'ignore')  # unusable charset
                    extr.feed(text)
                except (AssertionError, HTMLParseError, UnicodeDecodeError): pass  # keep what was extracted so far
                return extr.urlInfo
        except (URLError, ValueError) as e: raise InputError('Invalid URL %s' % url) from e
Exemple #2
0
    def getURLInfo(self, url=None):
        '''
        @see: IURLInfoService.getURLInfo

        Opens the provided URL and builds a URLInfo for it, with the URL and the current date set. When the
        response is not 'text/html' only the title is set, from the last path segment of the URL or from the host
        when the URL has no path. For 'text/html' the body is decoded, every substitution from 'self.html_fixes'
        is applied on the text and the result is fed to a HTMLInfoExtractor; parsing problems are ignored.

        @param url: string
            The URL to get the info for, it is unquoted before being opened.
        @return: URLInfo
            The info object populated for the URL.
        @raise InputError: if no URL is provided or if handling it raises URLError or ValueError.
        '''
        if not url: raise InputError('Invalid URL %s' % url)
        assert isinstance(url, str), 'Invalid URL %s' % url
        url = unquote(url)

        try:
            with urlopen(url) as conn:
                urlInfo = URLInfo()
                urlInfo.URL = url
                urlInfo.Date = datetime.now()

                # The message API matches the header name case insensitively and parses the parameters properly
                # (quotes, ordering, missing '='), which the manual split on ';' and '=' did not.
                headers = conn.info()
                contentType = headers.get_content_type()
                if contentType != 'text/html':
                    # The 'selector' and 'host' attributes are used instead of get_selector()/get_host(), which
                    # are no longer available on urllib.request.Request starting with Python 3.4.
                    req = Request(url)
                    selector = req.selector.strip('/')
                    if selector:
                        urlInfo.Title = selector.split('/')[-1]
                    else:
                        urlInfo.Title = req.host
                    return urlInfo

                urlInfo.ContentType = contentType
                extr = HTMLInfoExtractor(urlInfo)
                try:
                    readData = conn.read()
                    charset = headers.get_content_charset() or 'utf_8'
                    try:
                        decodedData = readData.decode(charset, 'ignore')
                    except (LookupError, ValueError):
                        # The declared charset is unknown or unusable, fall back on UTF-8.
                        decodedData = readData.decode('utf_8', 'ignore')
                    # Each fix is a mapping with the regex under 'from' and its replacement under 'to'.
                    for onePair in self.html_fixes:
                        decodedData = re.sub(onePair['from'], onePair['to'],
                                             decodedData)
                    extr.feed(decodedData)
                except (AssertionError, HTMLParseError, UnicodeDecodeError):
                    # Best effort parsing, whatever was extracted before the failure is still returned.
                    pass
                return extr.urlInfo
        except (URLError, ValueError) as e:
            raise InputError('Invalid URL %s' % url) from e
Exemple #3
0
    def getURLInfo(self, url=None):
        '''
        @see: IURLInfoService.getURLInfo

        Fetches the provided URL and returns a URLInfo describing it, having the URL and the current date set.
        Responses that are not 'text/html' only get a title: the last path segment of the URL, or the host when
        the URL has no path. 'text/html' responses are decoded, passed through the regex substitutions found in
        'self.html_fixes' and then fed to a HTMLInfoExtractor, ignoring any parsing problem.

        @param url: string
            The URL to get the info for, it is unquoted before being opened.
        @return: URLInfo
            The info object populated for the URL.
        @raise InputError: if no URL is provided or if handling it raises URLError or ValueError.
        '''
        if not url: raise InputError('Invalid URL %s' % url)
        assert isinstance(url, str), 'Invalid URL %s' % url
        url = unquote(url)

        try:
            with urlopen(url) as conn:
                urlInfo = URLInfo()
                urlInfo.URL = url
                urlInfo.Date = datetime.now()

                # Let the message API do the header parsing: the name lookup is case insensitive and the charset
                # parameter is found regardless of its position, quoting or of malformed sibling parameters.
                headers = conn.info()
                contentType = headers.get_content_type()
                if contentType != 'text/html':
                    # get_selector()/get_host() are gone from urllib.request.Request since Python 3.4, the
                    # attributes below carry the same values.
                    req = Request(url)
                    selector = req.selector.strip('/')
                    if selector: urlInfo.Title = selector.split('/')[-1]
                    else: urlInfo.Title = req.host
                    return urlInfo

                urlInfo.ContentType = contentType
                extr = HTMLInfoExtractor(urlInfo)
                try:
                    readData = conn.read()
                    charset = headers.get_content_charset() or 'utf_8'
                    try: decodedData = readData.decode(charset, 'ignore')
                    except (LookupError, ValueError): decodedData = readData.decode('utf_8', 'ignore')  # bad charset
                    # Each fix is a mapping with the regex under 'from' and its replacement under 'to'.
                    for onePair in self.html_fixes:
                        decodedData = re.sub(onePair['from'], onePair['to'], decodedData)
                    extr.feed(decodedData)
                except (AssertionError, HTMLParseError, UnicodeDecodeError): pass  # keep what was extracted so far
                return extr.urlInfo
        except (URLError, ValueError) as e: raise InputError('Invalid URL %s' % url) from e
Exemple #4
0
    def getURLInfo(self, url=None):
        '''
        @see: IURLInfoService.getURLInfo

        Opens the provided URL and returns a URLInfo for it, with the URL and the current date set. If the
        content is not 'text/html' only the title is set, from the last path segment of the URL or from the host
        when the URL has no path. If the content is 'text/html' the content type is recorded and the decoded page
        is fed to a HTMLInfoExtractor; parsing problems are ignored.

        @param url: string
            The URL to get the info for, it is unquoted before being opened.
        @return: URLInfo
            The info object populated for the URL.
        @raise InputError: if no URL is provided or if handling it raises URLError or ValueError.
        '''
        if not url: raise InputError('Invalid URL %s' % url)
        assert isinstance(url, str), 'Invalid URL %s' % url
        url = unquote(url)

        try:
            with urlopen(url) as conn:
                urlInfo = URLInfo()
                urlInfo.URL = url
                urlInfo.Date = datetime.now()

                # The message API looks the header up case insensitively and drops its parameters; a missing
                # header is reported as 'text/plain', which is handled by the non HTML branch as before.
                headers = conn.info()
                contentType = headers.get_content_type()
                if contentType != 'text/html':
                    # The 'selector' and 'host' attributes are used instead of get_selector()/get_host(), which
                    # are no longer available on urllib.request.Request starting with Python 3.4.
                    req = Request(url)
                    selector = req.selector.strip('/')
                    if selector:
                        urlInfo.Title = selector.split('/')[-1]
                    else:
                        urlInfo.Title = req.host
                    return urlInfo

                urlInfo.ContentType = contentType
                extr = HTMLInfoExtractor(urlInfo)
                try:
                    content = conn.read()
                    # Decode with the charset announced by the server; decoding everything as strict UTF-8 made
                    # pages in any other encoding fail and return without any extracted information.
                    charset = headers.get_content_charset() or 'utf_8'
                    try:
                        text = content.decode(charset, 'ignore')
                    except (LookupError, ValueError):
                        # The declared charset is unknown or unusable, fall back on UTF-8.
                        text = content.decode('utf_8', 'ignore')
                    extr.feed(text)
                except (AssertionError, HTMLParseError, UnicodeDecodeError):
                    # Best effort parsing, whatever was extracted before the failure is still returned.
                    pass
                return extr.urlInfo
        except (URLError, ValueError) as e:
            raise InputError('Invalid URL %s' % url) from e