def createMockFetchedDatum(url, html_text, status):
  """Create FetchedDbDatum mock data for tests."""
  key = ndb.Key(CrawlDbDatum, url)
  crawl = CrawlDbDatum.get_or_insert(url, parent=key, url=url,
      last_status=status)
  if status != pipelines.UNFETCHED:
    fetched_datum = FetchedDbDatum(parent=crawl.key,
        url=url, fetched_url=url,
        fetched_content=html_text, content_type="text/html")
    fetched_datum.put()
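# A minimal usage sketch, assuming the helper above: seeds one FETCHED
# page and reads it back with an ancestor query, as the pipeline code
# does. The URL and HTML snippet are illustrative values only, not
# taken from the original tests.
def _exampleSeedFetchedDatum():
  url = "http://example.com/"
  createMockFetchedDatum(url, "<html><body>sample</body></html>",
      pipelines.FETCHED)
  # The mock CrawlDbDatum is keyed by URL, so its FetchedDbDatum can be
  # recovered via an ancestor query.
  crawl = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()[0]
  return FetchedDbDatum.query(ancestor=crawl.key).fetch()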
def _clean_map(crawl_db_datum):
  """Map function that deletes unnecessary entities.

  Deletes finished CrawlDbDatum entities along with their
  FetchedDbDatum, or every entity when the clean-all flag is set.

  Args:
    crawl_db_datum: The CrawlDbDatum entity to examine.

  Yields:
    url_str: The deleted URL, followed by a newline.
  """
  delete_keys = []
  delete_fetched_datum = FetchedDbDatum.get_by_id(crawl_db_datum.url)
  if delete_fetched_datum is not None:
    delete_keys.append(delete_fetched_datum.key)
  data = ndb.Model.to_dict(crawl_db_datum)
  fetch_status = data.get("last_status", 2)
  url = ""
  clean_all = memcache.get(CLEAN_ALL_KEY)
  if clean_all or fetch_status in [FETCHED, SKIPPED, FAILED]:
    delete_keys.append(crawl_db_datum.key)
    url = crawl_db_datum.url
  ndb.delete_multi(delete_keys)
  yield url + "\n"
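# A minimal sketch, assuming CLEAN_ALL_KEY is the memcache flag read by
# _clean_map above: a caller would set it before starting the clean job
# to force deletion of every CrawlDbDatum regardless of fetch status.
def _exampleEnableCleanAll():
  # Any truthy value works; _clean_map only checks the flag's truth.
  memcache.set(CLEAN_ALL_KEY, True)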
def finalized(self):
  """Sends an email to admins indicating this Pipeline has completed.

  For developer convenience. Automatically called from finalized for root
  Pipelines that do not override the default action.
  """
  status = "successful"
  if self.was_aborted:
    status = "aborted"
  url = memcache.get("url")
  email = memcache.get("email")
  base_dir = os.path.realpath(os.path.dirname(__file__))
  # Configure jinja for internal templates
  env = Environment(
      autoescape=True,
      extensions=["jinja2.ext.i18n"],
      loader=FileSystemLoader(os.path.join(base_dir, "templates")))
  subject = "Your Fetcher Job is " + status
  crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()
  crawl_db_datum = crawl_db_datums[0]
  # Kick off the child queries asynchronously; the futures are resolved
  # below, before the template is rendered.
  content_db_datums_future = ContentDbDatum.query(
      ancestor=crawl_db_datum.key).fetch_async()
  fetched_db_datums = FetchedDbDatum.query(
      ancestor=crawl_db_datum.key).fetch()
  attachments = []
  if len(fetched_db_datums) > 0:
    fetched_db_datum = fetched_db_datums[0]
    attachments.append(
        ("fetched_content.html", fetched_db_datum.fetched_content))
  link_db_datums_future = LinkDbDatum.query(
      ancestor=crawl_db_datum.key).fetch_async()
  html = env.get_template("mail_template.html").render(
      url=url,
      contents=content_db_datums_future.get_result(),
      links=link_db_datums_future.get_result())
  attachments.append(("sendmail.html", html))
  sender = "*****@*****.**"
  mail.send_mail(
      sender=sender,
      to=email,
      subject=subject,
      body="FetchResults",
      html=html,
      attachments=attachments)
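# A minimal sketch, assuming the pipeline's caller is responsible for
# the memcache entries finalized() reads: the target URL and the
# notification address are stashed before the job starts. Both values
# here are illustrative.
def _exampleSeedNotification():
  memcache.set("url", "http://example.com/")
  memcache.set("email", "admin@example.com")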
def testSuccessfulRun(self):
  createMockCrawlDbDatum(2, 2, True)
  file_name1 = self.createMockData(("http://hoge_0.com/content_0", True))
  file_name2 = self.createMockData(("http://hoge_1.com/content_0", False))
  static_content = "<html><body>TestContent</body></html>"
  self.setReturnValue(content=static_content,
      headers={"Content-Length": len(static_content),
          "Content-Type": "text/html"})
  p = pipelines._FetchPagePipeline("FetchPipeline",
      [file_name1, file_name2], 2)
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  finished_map = pipelines._FetchPagePipeline.from_id(p.pipeline_id)
  # Can open files
  file_paths = finished_map.outputs.default.value
  self.assertTrue(len(file_paths) > 0)
  self.assertTrue(file_paths[0].startswith("/blobstore/"))
  entities = CrawlDbDatum.query(
      CrawlDbDatum.url == "http://hoge_0.com/content_0").fetch()
  entity = entities[0]
  fetched_datums = FetchedDbDatum.query(ancestor=entity.key).fetch()
  # fetch() returns a list, so check that it is non-empty rather than
  # non-None.
  self.assertTrue(len(fetched_datums) > 0)
def _extract_content_urls_map(data):
  """Map function that extracts outlinks from fetched content.

  The fetched content is parsed with a user-supplied UDF to extract the
  URLs of embedded contents. For example, if you specified a parser UDF
  for HTML and would like to fetch content from the target page while
  storing its outlinks, a typical implementation looks like this::

    def htmlParser(key, content):
      outlinks = re.findall(r'href=[\'"]?([^\'" >]+)', content)
      link_datums = []
      for link in outlinks:
        link_datum = LinkDbDatum(parent=key, link_url=link)
        link_datums.append(link_datum)
      ndb.put_multi_async(link_datums)
      content_links = re.findall(r'src=[\'"]?([^\'" >]+)', content)
      return content_links

  Note: the UDF returns the URLs of the contents that will be fetched
  in the next job (_FetchContentPipeline).

  Args:
    data: key value data, where the key is the position and the value
      is the URL.

  Yields:
    A (url, content_url) tuple for each extracted content URL.
  """
  k, url = data
  query = CrawlDbDatum.query(CrawlDbDatum.url == url)
  crawl_db_datum = query.fetch()
  key = crawl_db_datum[0].key
  fetched_datums = FetchedDbDatum.query(ancestor=key).fetch()
  # Guard against a missing FetchedDbDatum instead of indexing blindly.
  fetched_datum = fetched_datums[0] if fetched_datums else None
  content = None
  if fetched_datum is not None:
    content = fetched_datum.fetched_content
    mime_type = fetched_datum.content_type
  if content is not None:
    parsed_obj = None
    params = None
    try:
      params = _get_parser_param(_PARSER_PARAM_KEY)
      parsed_obj = util.handler_for_name(params[mime_type])(key, content)
    except Exception as e:
      logging.warning("Can not handle for %s[params:%s]:%s" %
          (mime_type, params, e.message))
    if parsed_obj is not None:
      for content_urls in parsed_obj:
        yield (url, content_urls)
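# A minimal sketch of the UDF dispatch used above (the mapping value is
# illustrative): parser_params maps a mime type to a dotted handler
# name, and util.handler_for_name resolves that name to the callable
# that is then invoked with (key, content).
def _exampleResolveParser():
  params = {"text/html": __name__ + ".htmlParser"}  # assumed mapping
  handler = util.handler_for_name(params["text/html"])
  return handler  # later called as handler(key, content)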
def testFetchEndToEnd(self):
  """End-to-end test of the fetcher job."""
  createMockCrawlDbDatum("http://foo.com/bar.html")
  static_robots = ("User-agent: test\nDisallow: /content_0\n"
      "Disallow: /content_1\nDisallow: /content_3")
  self.setReturnValue(url="http://foo.com/robots.txt",
      content=static_robots,
      headers={"Content-Length": len(static_robots)})
  # Static content is read from a resource file.
  resource = self.getResource("sample_content.html")
  static_content = resource.read()
  static_content_length = len(static_content)
  self.setReturnValue(url="http://foo.com/bar.html",
      content=static_content,
      headers={"Content-Length": static_content_length,
          "Content-Type": "text/html"})
  resource_image = self.getResource("slide1.png")
  static_content_image = resource_image.read()
  static_content_length = len(static_content_image)
  self.setReturnValue(url="http://foo.com/images/slide1.png",
      content=static_content_image,
      headers={"Content-Length": static_content_length,
          "Content-Type": "image/png"})
  p = pipelines.FetcherPipeline("FetcherPipeline",
      params={"entity_kind": "lakshmi.datum.CrawlDbDatum"},
      parser_params={"text/html": __name__ + ".htmlParser"},
      shards=2)
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  crawl_db_datums = CrawlDbDatum.query(
      CrawlDbDatum.url == "http://foo.com/bar.html").fetch()
  crawl_db_datum = crawl_db_datums[0]
  # assertEqual, not assertTrue: the second argument of assertTrue is
  # only a failure message.
  self.assertEqual(pipelines.FETCHED, crawl_db_datum.last_status)
  fetched_db_datums = FetchedDbDatum.query(
      ancestor=crawl_db_datum.key).fetch()
  fetched_db_datum = fetched_db_datums[0]
  self.assertTrue(fetched_db_datum is not None)
  self.assertEqual("http://foo.com/bar.html", fetched_db_datum.fetched_url)
  link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch()
  self.assertTrue(len(link_db_datums) > 0)
  contents_db_datums = ContentDbDatum.query(
      ancestor=crawl_db_datum.key).fetch()
  self.assertTrue(len(contents_db_datums) > 0)
def testSuccessfulRun(self): """Test extract outlinks by UDF.""" resource_neg = self.getResource("cloudysunny14.html") static_content = resource_neg.read() createMockFetchedDatum("http://cloudysunny14.html", static_content, pipelines.FETCHED) file_name = self.createMockDataLine("http://cloudysunny14.html\n") p = pipelines._ExtractOutlinksPipeline("ExtractOutlinksPipeline", file_names=[file_name], parser_params={ "text/html": __name__+"._htmlOutlinkParser" }) p.start() test_support.execute_until_empty(self.taskqueue) entities = CrawlDbDatum.query(CrawlDbDatum.url=="http://cloudysunny14.html").fetch() entity = entities[0] fetched_datum = FetchedDbDatum.query(ancestor=entity.key).fetch() self.assertTrue(fetched_datum!=None) qry = CrawlDbDatum.query(CrawlDbDatum.last_status == pipelines.UNFETCHED) crawl_db_datums = qry.fetch() self.assertTrue(len(crawl_db_datums)==0)
def testSuccessfulRun(self): """Test extract outlinks by UDF.""" resource_neg = self.getResource("cloudysunny14.html") static_content = resource_neg.read() createMockFetchedDatum("http://cloudysunny14.html", static_content, pipelines.FETCHED) file_name = self.createMockDataLine("http://cloudysunny14.html\n") p = pipelines._ExtractOutlinksPipeline( "ExtractOutlinksPipeline", file_names=[file_name], parser_params={"text/html": __name__ + "._htmlOutlinkParser"}) p.start() test_support.execute_until_empty(self.taskqueue) entities = CrawlDbDatum.query( CrawlDbDatum.url == "http://cloudysunny14.html").fetch() entity = entities[0] fetched_datum = FetchedDbDatum.query(ancestor=entity.key).fetch() self.assertTrue(fetched_datum != None) qry = CrawlDbDatum.query( CrawlDbDatum.last_status == pipelines.UNFETCHED) crawl_db_datums = qry.fetch() self.assertTrue(len(crawl_db_datums) == 0)
def _fetchMap(binary_record):
  """Map function that fetches a page and stores the result.

  Creates a FetchedDbDatum entity from the fetch result and stores it
  to the datastore.

  Args:
    binary_record: key value data, where the key is the URL to fetch
      and the value is a boolean indicating whether it can be fetched.

  Yields:
    The fetched URL followed by a newline, or an empty string when the
    fetch failed or was skipped.
  """
  proto = file_service_pb.KeyValue()
  proto.ParseFromString(binary_record)
  url = proto.key()
  could_fetch = _str2bool(proto.value())
  result = UNFETCHED
  fetched_url = ""
  fetch_date = None
  crawl_db_datum_future = None
  # Fetch the CrawlDbDatum for this URL.
  try:
    query = CrawlDbDatum.query(CrawlDbDatum.url == url)
    crawl_db_datum_future = query.fetch_async()
  except Exception as e:
    logging.warning("Failed create key, caused by invalid url:" +
        url + ":" + e.message)
    could_fetch = False
  if could_fetch:
    # Start fetch.
    fetcher = fetchers.SimpleHttpFetcher(1,
        fetcher_policy_yaml.fetcher_policy)
    try:
      fetch_result = fetcher.get(url)
      if fetch_result:
        # Storing to datastore.
        crawl_db_datums = crawl_db_datum_future.get_result()
        fetched_datum = FetchedDbDatum(parent=crawl_db_datums[0].key,
            url=url,
            fetched_url=fetch_result.get("fetched_url"),
            fetch_time=fetch_result.get("time"),
            fetched_content=fetch_result.get("content"),
            content_type=fetch_result.get("mime_type"),
            content_size=fetch_result.get("read_rate"),
            response_rate=fetch_result.get("read_rate"),
            http_headers=str(fetch_result.get("headers")))
        fetched_datum.put()
        # Update time of last fetched.
        result = FETCHED
        fetch_date = datetime.datetime.now()
        fetched_url = "%s\n" % url
    except Exception as e:
      logging.warning("Fetch Page Error Occurs:" + e.message)
      result = FAILED
  else:
    result = FAILED
  # Update status on all datums; skip when the query itself failed and
  # no future was created.
  if crawl_db_datum_future is not None:
    crawl_db_datums = crawl_db_datum_future.get_result()
    for datum in crawl_db_datums:
      datum.last_status = result
      datum.last_fetched = fetch_date
    ndb.put_multi(crawl_db_datums)
  yield fetched_url
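# A minimal sketch (an assumption, not the library's definition) of the
# _str2bool helper that _fetchMap relies on: KeyValue records carry the
# fetchability flag as a string, so it must be converted back.
def _str2bool(value):
  # "True"/"true" map to True; anything else is treated as False.
  return value.lower() == "true"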