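# Assumed imports for this module (reconstructed, not in the original
# excerpt): `url`, `crawler`, `languages`, `InvalidUsage`, `client`
# (an Elasticsearch client), `redis_conn` and `create_thumbnail` are
# project-local names defined elsewhere in the repo.
from datetime import datetime
from urllib.parse import urlparse

from rq import Queue
from scrapy import Request
from scrapy.crawler import CrawlerProcess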
def explore_job(link):
    """ Explore a website and index all its URLs (redis-rq job). """
    print("explore website at : %s" % link)

    # get the final url after possible redirections
    try:
        link = url.crawl(link).url
    except Exception:
        return 0

    # create or update domain data
    domain = url.domain(link)
    res = client.index(index="web", doc_type='domain', id=domain, body={
        "homepage": link,
        "domain": domain,
        "last_crawl": datetime.now()
    })

    # start the crawler; redirects are disabled so the spider can handle
    # them manually (see pipeline below)
    process = CrawlerProcess({
        'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        'DOWNLOAD_TIMEOUT': 100,
        'DOWNLOAD_DELAY': 0.25,
        'ROBOTSTXT_OBEY': True,
        'HTTPCACHE_ENABLED': False,
        'REDIRECT_ENABLED': False,
        # explicitly enable the robots.txt and HTTP cache downloader
        # middlewares, the HTTP error spider middleware and the CloseSpider
        # extension, each at its default priority
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
            'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
        },
        'EXTENSIONS': {
            'scrapy.extensions.closespider.CloseSpider': 0,
        },
        'CLOSESPIDER_PAGECOUNT': 500  # only for debugging
    })
    process.crawl(crawler.Crawler,
                  allowed_domains=[urlparse(link).netloc],
                  start_urls=[link],
                  es_client=client,
                  redis_conn=redis_conn)
    process.start()
    return 1
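# Hedged usage sketch (not from the original module): explore_job is meant to
# be enqueued on a redis-rq queue and picked up by a worker. The queue name
# and Redis connection parameters below are assumptions for illustration.
#
# from redis import Redis
# from rq import Queue
#
# q = Queue("default", connection=Redis())
# q.enqueue(explore_job, "https://example.com")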
def reference_job(link, email):
    """ Request the referencing of a website. """
    print("referencing page %s with email %s" % (link, email))

    # get the final url after possible redirections
    try:
        link = url.crawl(link).url
    except Exception:
        return 0

    # create or update domain data
    domain = url.domain(link)
    res = client.index(index="web", doc_type='domain', id=domain, body={
        "homepage": link,
        "domain": domain,
        "email": email
    })
    return 1
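# Hedged usage sketch: reference_job is presumably enqueued the same way as
# explore_job, with the contact email as the second argument, e.g.
#   q.enqueue(reference_job, "https://example.com", "owner@example.com")
# (queue setup as in the sketch above).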
def pipeline(response, spider):
    """ Index a page. """
    # skip responses without an <html> tag (e.g. RSS or Atom feeds)
    if not response.css("html").extract_first():
        return

    # get domain
    domain = url.domain(response.url)

    # extract title
    title = response.css('title::text').extract_first()
    title = title.strip() if title else ""

    # extract description
    description = response.css("meta[name=description]::attr(content)").extract_first()
    description = description.strip() if description else ""

    # get main language of page, and main content of page
    lang = url.detect_language(response.body)
    if lang not in languages:
        raise InvalidUsage('Language not supported')
    body, boilerplate = url.extract_content(response.body, languages.get(lang))

    # weight of page: 3 when both title and description are present,
    # less when either is missing
    weight = 3
    if not title and not description:
        weight = 0
    elif not title:
        weight = 1
    elif not description:
        weight = 2
    if body.count(" ") < boilerplate.count(" ") or not url.create_description(body):
        # probably bad content quality
        weight -= 1

    # -- TEST: keyword extraction, currently disabled --
    # keywords = Counter()
    # text_for_keywords = "%s\n%s\n%s" % (title, description, body)
    # r = requests.post('http://localhost:5001/keywords_from_text',
    #                   data={'text': text_for_keywords})
    # data = r.json()
    # for k in data["keywords"]:
    #     keywords[k] += 1
    # keywords = " ".join([("%s " % kw) * score
    #                      for kw, score in keywords.most_common(100)])

    # index url and data
    res = spider.es_client.index(index="web-%s" % lang, doc_type='page', id=response.url, body={
        "url": response.url,
        "domain": domain,
        "title": title,
        "description": description,
        "body": body,
        "weight": weight
    })

    # try to create a thumbnail from the page's social-media image tags
    img_link = response.css("meta[property='og:image']::attr(content)").extract_first()
    if not img_link:
        img_link = response.css("meta[name='twitter:image']::attr(content)").extract_first()
    if img_link:
        q = Queue(connection=spider.redis_conn)
        q.enqueue(create_thumbnail, response.url, lang, img_link)

    # follow redirects manually (REDIRECT_ENABLED is False in the crawler settings)
    if response.status in spider.handle_httpstatus_list and 'Location' in response.headers:
        newurl = response.headers['Location']
        meta = {'dont_redirect': True,
                "handle_httpstatus_list": spider.handle_httpstatus_list}
        meta.update(response.request.meta)
        return Request(url=newurl.decode("utf8"), meta=meta, callback=spider.parse)
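# Hedged wiring sketch (assumption, not from this module): the spider in
# `crawler.Crawler` presumably forwards each response to pipeline() from its
# parse callback and yields any follow-up Request it returns. The class name,
# status list and attributes below are illustrative only.
#
# import scrapy
#
# class Crawler(scrapy.Spider):
#     name = "crawler"
#     handle_httpstatus_list = [301, 302]
#
#     def parse(self, response):
#         return pipeline(response, self)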