def keywords_from_url():
    """
    URL : /keywords_from_url
    Extract keywords from the text content of a web page.
    Method : POST
    Form data :
        - url : the url to analyze [string, required]
        - hits : limit number of keywords returned [int, optional, 100 by default]
    Return a JSON dictionary : {"keywords":[list of keywords]}
    Raises InvalidUsage when the url is missing, the page has no content,
    or "hits" is not an integer.
    """
    # get POST data
    data = dict((key, request.form.get(key)) for key in request.form.keys())
    if "url" not in data:
        raise InvalidUsage('No url specified in POST data')

    # validate the optional "hits" limit up front, before any crawling work,
    # so bad client input yields a 4xx instead of an unhandled ValueError (500)
    try:
        hits = int(data.get("hits", 100))
    except (TypeError, ValueError):
        raise InvalidUsage('hits must be an integer')

    # crawl url and get its main text content
    url_data = url.crawl(data["url"])
    if not url_data:
        raise InvalidUsage('No content to analyze')
    text_content = url.extract_content(url_data.text)

    # analyze text and extract keywords (Counter-like mapping keyword -> score)
    keywords = language.keyword_mining(text_content)

    # limit the number of keywords returned, keeping the highest-scored first
    total = len(keywords)
    keywords = [kw for kw, score in keywords.most_common(hits)]

    return jsonify(keywords=keywords, total=total)
def pipeline(response, spider):
    """
    Index a crawled page into Elasticsearch and, on an HTTP redirect status,
    return a follow-up Request for the redirect target.
    """
    # rss/atom feeds have no <html> root: nothing to index
    if not response.css("html").extract_first():
        return

    page_domain = url.domain(response.url)

    # title / description, normalized to stripped strings ("" when absent)
    raw_title = response.css('title::text').extract_first()
    title = raw_title.strip() if raw_title else ""
    raw_desc = response.css("meta[name=description]::attr(content)").extract_first()
    description = raw_desc.strip() if raw_desc else ""

    # detect the page language and split main content from boilerplate
    lang = url.detect_language(response.body)
    if lang not in languages:
        raise InvalidUsage('Language not supported')
    body, boilerplate = url.extract_content(response.body, languages.get(lang))

    # page weight: +2 for having a title, +1 for having a description
    # (3 both, 2 title only, 1 description only, 0 neither)
    weight = (2 if title else 0) + (1 if description else 0)
    if body.count(" ") < boilerplate.count(" ") or not url.create_description(body):
        # probably bad content quality
        weight -= 1

    res = spider.es_client.index(index="web-%s" % lang, id=response.url, body={
        "url": response.url,
        "domain": page_domain,
        "title": title,
        "description": description,
        "body": body,
        "weight": weight
    })

    # redirects are handled manually: re-enqueue the Location target
    if response.status in spider.handle_httpstatus_list and 'Location' in response.headers:
        newurl = response.headers['Location']
        meta = {'dont_redirect': True, "handle_httpstatus_list": spider.handle_httpstatus_list}
        meta.update(response.request.meta)
        return Request(url=newurl.decode("utf8"), meta=meta, callback=spider.parse)
def index():
    """
    URL : /index
    Index a new URL in search engine.
    Method : POST
    Form data :
        - url : the url to index [string, required]
    Return a success message.
    Raises InvalidUsage when the url is missing, uncrawlable,
    or in an unsupported language.
    """
    # get POST data
    data = dict((key, request.form.get(key)) for key in request.form.keys())
    if "url" not in data:
        raise InvalidUsage('No url specified in POST data')

    # crawl url
    url_data = url.crawl(data["url"])
    if not url_data:
        raise InvalidUsage("URL is invalid or has no text inside")

    # get main language of page
    lang = url.detect_language(url_data.text)
    if lang not in languages:
        raise InvalidUsage('Language not supported')

    # extract title of url
    title = url.extract_title(url_data.text)

    # extract description of url
    description = url.extract_description(url_data.text)

    # extract main content of url
    # BUG FIX: with a language resource, extract_content returns a
    # (body, boilerplate) tuple (see the crawler pipeline); the previous code
    # assigned the whole tuple to `body` and indexed the tuple as the document
    # body. Unpack it and index only the main content.
    body, boilerplate = url.extract_content(url_data.text, languages.get(lang))

    # index url and data
    res = client.index(index="web-%s" % lang, doc_type='page', id=data["url"], body={
        "title": title,
        "description": description,
        "body": body,
        "url": data["url"]
    })
    return "Success"
def pipeline(response, spider):
    """
    Index a crawled page: store it in Elasticsearch, enqueue thumbnail
    creation when a social-preview image is declared, and return a follow-up
    Request when the response is a handled redirect status.
    Raises InvalidUsage when the page language is not supported.
    """
    # skip rss or atom urls (no <html> root element)
    if not response.css("html").extract_first():
        return

    # get domain
    domain = url.domain(response.url)

    # extract title, normalized to a stripped string ("" when absent)
    title = response.css('title::text').extract_first()
    title = title.strip() if title else ""

    # extract description, normalized the same way
    description = response.css("meta[name=description]::attr(content)").extract_first()
    description = description.strip() if description else ""

    # get main language of page, and main content of page
    lang = url.detect_language(response.body)
    if lang not in languages:
        raise InvalidUsage('Language not supported')
    body, boilerplate = url.extract_content(response.body, languages.get(lang))

    # weight of page: 3 with title+description, 2 title only,
    # 1 description only, 0 neither
    weight = 3
    if not title and not description:
        weight = 0
    elif not title:
        weight = 1
    elif not description:
        weight = 2
    if body.count(" ") < boilerplate.count(" ") or not url.create_description(body):
        # probably bad content quality
        weight -= 1

    # NOTE: a commented-out experimental block (keyword mining via a local
    # /keywords_from_text service) was removed here — dead code.

    # index url and data
    res = spider.es_client.index(index="web-%s" % lang, doc_type='page', id=response.url, body={
        "url": response.url,
        "domain": domain,
        "title": title,
        "description": description,
        "body": body,
        "weight": weight
    })

    # try to create thumbnail from page (og:image, falling back to twitter:image)
    img_link = response.css("meta[property='og:image']::attr(content)").extract_first()
    if not img_link:
        img_link = response.css("meta[name='twitter:image']::attr(content)").extract_first()
    if img_link:
        q = Queue(connection=spider.redis_conn)
        q.enqueue(create_thumbnail, response.url, lang, img_link)

    # check for redirect url: re-enqueue the Location target manually
    if response.status in spider.handle_httpstatus_list and 'Location' in response.headers:
        newurl = response.headers['Location']
        meta = {'dont_redirect': True, "handle_httpstatus_list": spider.handle_httpstatus_list}
        meta.update(response.request.meta)
        return Request(url=newurl.decode("utf8"), meta=meta, callback=spider.parse)