def fetch(url):
    result = {}
    fetcher = ContentFetcher(url, tried=2)
    fetchResult = fetcher.fetch()
    content = fetchResult.get('content')
    if not content:
        return result
    try:
        htmlelement = lxml.html.fromstring(content)
    except Exception:
        logging.error('Failed to load html from content.')
        return result
    match = pyquery.PyQuery(htmlelement)('head meta[name=keywords]')
    if match:
        mainElement = match[0]
        keywords = mainElement.get('content')
        if keywords:
            result['keywords'] = lxmlutil.getPureString(keywords)
    match = pyquery.PyQuery(htmlelement)('head meta[name=description]')
    if match:
        mainElement = match[0]
        description = mainElement.get('content')
        if description:
            result['description'] = lxmlutil.getPureString(description)
    match = pyquery.PyQuery(htmlelement)('head title')
    if match:
        mainElement = match[0]
        title = mainElement.text_content()
        if title:
            result['title'] = lxmlutil.getPureString(title)
    return result

def _fetchContent(data, triedcount, feedback):
    fetchurl = data['fetchurl']
    header = data.get('header')
    encoding = data.get('encoding')
    fetcher = ContentFetcher(fetchurl, header=header, encoding=encoding,
                             tried=triedcount)
    fetchResult = fetcher.fetch(feedback)
    content = fetchResult.get('content')
    urlUsed = fetchResult.get('url')
    return urlUsed, content

def post(self):
    data = json.loads(self.request.body)
    items = data['items']
    origin = data['origin']
    header = data.get('header')
    for item in items:
        url = item.get('url')
        if not url:
            continue
        fetcher = ContentFetcher(url, header=header, tried=2)
        fetchResult = fetcher.fetch()
        usedUrl = fetchResult.get('url')
        content = fetchResult.get('content')
        if not content:
            logging.error('Failed to get content from %s.' % (url, ))
            continue
        item['url'] = usedUrl
        try:
            editorFormat = globalconfig.getEditorFormat()
            page = pageanalyst.analyse(usedUrl, content,
                                       editorFormat=editorFormat,
                                       monitorTitle=item.get('title'))
            if not item.get('title') and page.get('title'):
                item['title'] = page['title']
            if not item.get('published') and page.get('published') \
                    and not page['published'].endswith('0000'):
                # if no hour, minute, published is not precise enough
                item['published'] = page['published']
                if origin.get('timezone'):
                    item['published'] = dateutil.adjustDate14(
                        item['published'], origin['timezone'])
            if not item.get('content') and page.get('content'):
                item['content'] = page['content']
            if not item.get('img') and page.get('images'):
                item['img'] = page['images'][0]
        except Exception:
            logging.exception('Error happens when analyse %s.' % (usedUrl, ))
    responseData = {
        'origin': data['origin'],
        'items': items,
    }
    self.response.headers['Content-Type'] = 'text/plain'
    callbackurl = data['callbackurl']
    success = networkutil.postData(callbackurl, responseData,
                                   trycount=_CALLBACK_TRYCOUNT,
                                   timeout=_URL_TIMEOUT)
    if success:
        message = 'Push items back for %s to %s.' % (data['origin'], callbackurl)
    else:
        message = 'Failed to push items back for %s to %s.' % (data['origin'], callbackurl)
    logging.info(message)
    self.response.out.write(message)

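# Illustrative only: a minimal sketch of the JSON body the handler above appears
# to expect, inferred from the keys it reads (items, origin, header, callbackurl).
# All values are made-up placeholders; the fields inside 'origin' other than
# 'timezone' are purely hypothetical.
_EXAMPLE_PUSH_REQUEST = {
    'origin': {'slug': 'example-source', 'timezone': 'UTC'},  # hypothetical fields
    'header': {'User-Agent': 'Mozilla/5.0'},                  # optional
    'callbackurl': 'http://example.com/callback',
    'items': [
        {'url': 'http://example.com/article-1', 'title': ''},
    ],
}
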
def fetch(url):
    parseresult = urlparse.urlparse(url)
    queryurl = 'http://data.alexa.com/data?cli=10&url=%s' % (parseresult.netloc, )
    result = {}
    fetcher = ContentFetcher(queryurl, tried=2)
    fetchResult = fetcher.fetch()
    content = fetchResult.get('content')
    if not content:
        return result
    tree = lxmlutil.parseFromUnicode(content)
    alexa = getAlexaInfo(tree)
    if alexa:
        result['alexa'] = alexa
    dmoz = getDmozInfo(tree)
    if dmoz:
        result['dmoz'] = dmoz
    return result

def _detectDetailUrl(url, title):
    tried = 2
    fetcher = ContentFetcher(url, tried=tried)
    fetchResult = fetcher.fetch()
    content = fetchResult.get('content')
    if not content:
        return None
    docelement = lxml.html.fromstring(content)
    aElements = pyquery.PyQuery(docelement)('a')
    for aElement in aElements:
        if lxmlutil.getCleanText(aElement) != title:
            continue
        detailUrl = aElement.get('href')
        if detailUrl:
            detailUrl = urlparse.urljoin(url, detailUrl)
            return detailUrl
    return None

def get(self):
    url = self.request.get('url')
    page = None
    if url:
        try:
            url = base64.b64decode(url)
            url2 = ''
            length = len(url)
            for i in range(0, length, 2):
                if i + 1 < length:
                    url2 += url[i+1] + url[i]
            if length % 2 != 0:
                url2 += url[-1]
            url = url2
        except TypeError:
            pass
        key = stringutil.calculateHash([url])
        page = memcache.get(key)
        contentGot = bool(page)
        if not page:
            tried = 2  # the max try count is 3
            fetcher = ContentFetcher(url, tried=tried)
            fetchResult = fetcher.fetch()
            content = fetchResult.get('content')
            if content:
                editorFormat = globalconfig.getEditorFormat()
                page = pageanalyst.analyse(url, content, editorFormat=editorFormat)
                if page:
                    page['url'] = url
                if page and (page.get('content') or page.get('images')):
                    memcache.set(key, page)
                    contentGot = True
        if not contentGot:
            page = {'url': url}
            self.redirect(url, permanent=True)
            return
        if 'images' in page:
            for image in page['images']:
                image['url'] = '/image/?url=' + urllib.quote(image['url'].encode('utf-8'))
    templateValues = {
        'page': page,
    }
    self.render(templateValues, 'home.html')

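# Illustrative only: the handler above de-obfuscates the incoming 'url'
# parameter by Base64-decoding it and then swapping adjacent character pairs.
# A client would therefore encode a URL with the inverse transform sketched
# below (pair swapping is its own inverse). The helper name is hypothetical
# and not part of the original code; written for the Python 2 runtime the
# handler above targets (urllib.quote, memcache).
def _obfuscate_url(url):
    swapped = ''
    for i in range(0, len(url) - 1, 2):
        swapped += url[i + 1] + url[i]  # swap each adjacent pair
    if len(url) % 2 != 0:
        swapped += url[-1]              # odd trailing character kept as-is
    return base64.b64encode(swapped)
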
def post(self):
    url = self.request.get('url')
    title = self.request.get('title')
    fetchResult = {}
    content = None
    page = None
    fortest = bool(self.request.get('fortest'))
    httpheader = self.request.get('httpheader')
    header = None
    if httpheader:
        header = json.loads(httpheader)
    if url:
        tried = 2  # the max try count is 3
        fetcher = ContentFetcher(url, header=header, tried=tried)
        fetchResult = fetcher.fetch()
        content = fetchResult.get('content')
    elementResult = {}
    if content:
        editorFormat = globalconfig.getEditorFormat()
        page = pageanalyst.analyse(url, content, editorFormat=editorFormat,
                                   monitorTitle=title, fortest=fortest,
                                   elementResult=elementResult)
    if header:
        httpheader = jsonutil.getReadableString(header)
    templateValues = {
        'url': url,
        'title': title,
        'fortest': fortest,
        'httpheader': httpheader,
        'encoding': fetchResult.get('encoding'),
        'encodingSrc': fetchResult.get('encoding.src'),
        'oldContent': fetchResult.get('content.old'),
        'content': fetchResult.get('content'),
        'pagestr': jsonutil.getReadableString(page),
        'page': page,
        'elementResult': elementResult,
    }
    self.render(templateValues, 'test.html')

""" Fetch the url provided and retrieve links, subsequently fetching the pages at those links until reaching limit (or running out of links). :param start_url: url to start from :param limit: number of urls to return in list :return: list of urls discovered """ urls = [start_url] seen = {start_url: True} count = 1 while len(urls) > 0 and count < limit: url = urls.pop() contents = self.content_fetcher.retrieve_page(url) new_urls = filter(lambda x: x not in seen, extract_urls(url, contents)) for new_url in new_urls: if count == limit: break urls.append(new_url) seen[new_url] = True count += 1 return list(seen.keys()) if __name__ == "__main__": parser = setup_argument_parser() args = parser.parse_args() web_crawler = WebCrawler(ContentFetcher(args.agents)) found_urls = web_crawler.discover(args.url, limit=args.limit) for url in found_urls: print(url)
def post(self):
    action = self.request.get('action')
    keyword = ''
    pageinfo = None
    if action == 'JSON':
        jsonstr = self.request.get('jsonstr')
        if jsonstr:
            newssource = json.loads(jsonstr)
        else:
            newssource = _DEFAULT_NEWSSOURCE
        encodingUsed = ''
        urlUsed = ''
        content = ''
        oldContent = ''
        httpheader = ''
        formatter = ''
    else:
        keyword = self.request.get('keyword').strip()
        pageinfo = self.request.get('pageinfo').strip()
        if pageinfo:
            pageinfo = json.loads(pageinfo)
        newssource = {}
        newssource['active'] = bool(self.request.get('active'))
        newssource['slug'] = self.request.get('slug')
        newssource['name'] = self.request.get('name')
        newssource['order'] = self.request.get('order')
        newssource['charts'] = bool(self.request.get('charts'))
        newssource['fetchurl'] = self.request.get('fetchurl')
        if newssource['fetchurl'] and not newssource['fetchurl'].startswith('http'):
            newssource['fetchurl'] = 'http://' + newssource['fetchurl']
        if not newssource['slug'] and newssource['fetchurl']:
            newssource['slug'] = urlparse.urlparse(newssource['fetchurl']).netloc
        httpheader = self.request.get('httpheader')
        if httpheader:
            newssource['header'] = json.loads(httpheader)
        newssource['encoding'] = self.request.get('encoding')
        newssource['tags'] = self.request.get('tags')
        # following fields only for showing parsed result.
        encodingUsed = self.request.get('encodingUsed')
        urlUsed = self.request.get('urlUsed')
        oldContent = self.request.get('oldContent')
        newssource['selector'] = self.request.get('selector').strip()
        conditions = {}
        conditions['returnall'] = bool(self.request.get('returnall'))
        conditions['emptytitle'] = bool(self.request.get('emptytitle'))
        conditions['detectdetail'] = bool(self.request.get('detectdetail'))
        conditions['scripttext'] = bool(self.request.get('scripttext'))
        excludeselector = self.request.get('excludeselector').strip()
        if excludeselector:
            if 'exclude' not in conditions:
                conditions['exclude'] = {}
            conditions['exclude']['selector'] = excludeselector
        includeselector = self.request.get('includeselector').strip()
        if includeselector:
            if 'include' not in conditions:
                conditions['include'] = {}
            conditions['include']['selector'] = includeselector
        urlselector = self.request.get('urlselector').strip()
        titleselector = self.request.get('titleselector').strip()
        imageselector = self.request.get('imageselector').strip()
        contentselector = self.request.get('contentselector').strip()
        linkselector = self.request.get('linkselector').strip()
        imagelinkselector = self.request.get('imagelinkselector').strip()
        if urlselector or titleselector or contentselector or \
                imageselector or linkselector or imagelinkselector:
            conditions['criterion'] = {}
            if urlselector:
                conditions['criterion']['url'] = urlselector
            if titleselector:
                conditions['criterion']['title'] = titleselector
            if contentselector:
                conditions['criterion']['content'] = contentselector
            if imageselector:
                conditions['criterion']['image'] = imageselector
            if linkselector:
                conditions['criterion']['link'] = linkselector
            if imagelinkselector:
                conditions['criterion']['imagelink'] = imagelinkselector
        newssource['conditions'] = conditions
        formatter = self.request.get('formatter')
        if formatter:
            newssource['formatter'] = json.loads(formatter)
        newssource['description'] = self.request.get('description').strip()
        content = self.request.get('content')
    jsonstr = jsonutil.getReadableString(newssource)
    if 'active' not in newssource:
        newssource['active'] = True
    items = []
    links = []
    selector = newssource.get('selector')
    fetchurl = newssource.get('fetchurl')
    tried = 2  # the max try count is 3
    if not content and fetchurl:
        fetcher = ContentFetcher(fetchurl, header=newssource.get('header'),
                                 encoding=newssource.get('encoding'), tried=tried)
        fetchResult = fetcher.fetch()
        content = fetchResult.get('content')
        oldContent = fetchResult.get('content.old')
        urlUsed = fetchResult.get('url')
        encodingUsed = '%s-%s' % (fetchResult.get('encoding'),
                                  fetchResult.get('encoding.src'))
    if content:
        content = lxmlutil.removeEncodingDeclaration(content)
        if selector:
            parser = HtmlContentParser()
            items = parser.parse(urlUsed, content, selector,
                                 newssource.get('conditions'),
                                 newssource.get('formatter'))
        else:
            links = linkdetector.detect(content, keyword)
    if items and newssource.get('conditions', {}).get('detectdetail'):
        detaildetector.populateDetailUrls(items)
    if newssource.get('header'):
        httpheader = jsonutil.getReadableString(newssource['header'])
    if newssource.get('formatter'):
        formatter = jsonutil.getReadableString(newssource['formatter'])
    if not pageinfo and fetchurl:
        pageinfo = pmapi.getPage(fetchurl)
    templateValues = {
        'newssource': newssource,
        'httpheader': httpheader,
        'formatter': formatter,
        'content': content,
        'oldContent': oldContent,
        'encodingUsed': encodingUsed,
        'urlUsed': urlUsed,
        'keyword': keyword,
        'links': links,
        'items': items,
        'jsonstr': jsonstr,
        'pageinfo': pageinfo,
        'strpageinfo': json.dumps(pageinfo),
    }
    self._render(templateValues)

class TestContentFetcher(unittest.TestCase):
    user_agents = ["Mozilla", "Python", "Something Else"]
    url = "https://crawler-test.com/"

    def setUp(self) -> None:
        self.content_fetcher = ContentFetcher(self.user_agents)

    def test_get_next_user_agent_cycles_through_agents(self):
        self.assertEqual(self.content_fetcher.get_next_user_agent(), "Mozilla")
        self.assertEqual(self.content_fetcher.get_next_user_agent(), "Python")
        self.assertEqual(self.content_fetcher.get_next_user_agent(), "Something Else")
        self.assertEqual(self.content_fetcher.get_next_user_agent(), "Mozilla")

    def test_get_next_user_agent_cycles_returns_none_when_none_given(self):
        content_fetcher_no_agents = ContentFetcher([])
        self.assertIsNone(content_fetcher_no_agents.get_next_user_agent())

    def test_construct_request_adds_correct_headers(self):
        request = self.content_fetcher.construct_request(self.url)
        self.assertEqual(request.get_header("User-agent"), "Mozilla")
        self.assertEqual(request.get_header("Accept-encoding"), "gzip, deflate")
        self.assertEqual(request.get_full_url(), self.url)
        self.assertEqual(request.get_header("Accept"), "text/html")

    def test_decompress_content_handles_gzip(self):
        test_bytes = "Compress me".encode('utf-8')
        compressed_data = gzip.compress(test_bytes)
        self.assertEqual(
            self.content_fetcher.decompress_content(compressed_data, "gzip"),
            test_bytes)

    def test_decompress_content_handles_deflate(self):
        test_bytes = "Compress me".encode('utf-8')
        compressed_data = zlib.compress(test_bytes)
        self.assertEqual(
            self.content_fetcher.decompress_content(compressed_data, "deflate"),
            test_bytes)

    def test_decompress_content_recovers_when_unknown_format(self):
        test_bytes = "Compress me".encode('utf-8')
        compressed_data = zlib.compress(test_bytes)
        self.assertEqual(
            self.content_fetcher.decompress_content(compressed_data, "unknown"),
            ''.encode('utf-8'))

    def test_handle_response_can_handle_gzip_content(self):
        test_string = "Compress me"
        compressed_data = gzip.compress(test_string.encode('utf-8'))
        headers = [("Content-Encoding", "gzip")]
        self.assertEqual(
            self.content_fetcher.handle_response(headers, compressed_data),
            test_string)

    def test_handle_response_can_handle_deflate_content(self):
        test_string = "Compress me"
        compressed_data = zlib.compress(test_string.encode('utf-8'))
        headers = [("Content-Encoding", "deflate")]
        self.assertEqual(
            self.content_fetcher.handle_response(headers, compressed_data),
            test_string)

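# Illustrative only: a minimal decompress_content consistent with the tests
# above (gzip via gzip.decompress, deflate via zlib.decompress, empty bytes for
# an unknown encoding). This is a sketch of the behaviour the tests expect,
# not the project's actual ContentFetcher implementation.
import gzip
import zlib


def decompress_content(data, encoding):
    try:
        if encoding == "gzip":
            return gzip.decompress(data)
        if encoding == "deflate":
            return zlib.decompress(data)
    except (OSError, zlib.error):
        pass
    # unknown or undecodable encoding: recover with empty bytes
    return b''
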