def finalized(self):
    """Email the job requester a summary once this pipeline has finished.

    The crawled URL and the recipient address are read from memcache (keys
    ``"url"`` and ``"email"``).  The records stored under the matching
    ``CrawlDbDatum`` are rendered with ``templates/mail_template.html``
    (looked up next to this module) and mailed as the HTML body.  The
    rendered page is also attached as ``sendmail.html`` and, when a fetched
    document exists, its content is attached as ``fetched_content.html``.

    When no ``CrawlDbDatum`` matches the URL, a plain status mail without
    HTML body or attachments is sent instead of failing with ``IndexError``.
    """
    status = "aborted" if self.was_aborted else "successful"
    subject = "Your Fetcher Job is " + status
    sender = "*****@*****.**"

    url = memcache.get("url")
    email = memcache.get("email")

    crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()
    if not crawl_db_datums:
        # Nothing was recorded for this URL (or the memcache entry is gone),
        # so there is no result page to render; still report the job status.
        mail.send_mail(
            sender=sender,
            to=email,
            subject=subject,
            body="FetchResults: no crawl record was found for the requested URL.",
        )
        return
    crawl_db_datum = crawl_db_datums[0]

    # Configure jinja for internal templates
    base_dir = os.path.realpath(os.path.dirname(__file__))
    env = Environment(
        autoescape=True,
        extensions=["jinja2.ext.i18n"],
        loader=FileSystemLoader(os.path.join(base_dir, "templates")),
    )

    # NOTE(review): the content and link queries are started asynchronously
    # and their unresolved results are handed to the template exactly as
    # before; presumably the template resolves them itself -- confirm.
    content_db_datums = ContentDbDatum.query(ancestor=crawl_db_datum.key).fetch_async()
    fetched_db_datums = FetchedDbDatum.query(ancestor=crawl_db_datum.key).fetch()

    attachments = []
    if fetched_db_datums:
        # Only the first fetched document is attached.
        attachments.append(("fetched_content.html", fetched_db_datums[0].fetched_content))

    link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch_async()
    html = env.get_template("mail_template.html").render(
        url=url, contents=content_db_datums, links=link_db_datums
    )
    attachments.append(("sendmail.html", html))

    mail.send_mail(
        sender=sender,
        to=email,
        subject=subject,
        body="FetchResults",
        html=html,
        attachments=attachments,
    )
def testSuccessfulRun(self):
    """Run _FetchContentPipeline over two link files for one crawled page.

    Both mock input files pair the same page URL with one linked resource.
    Every fetch is answered with the bytes of ``slide1.png``.  Afterwards the
    pipeline output must be readable blobstore record files, and two
    ``ContentDbDatum`` entities must exist under the page's ``CrawlDbDatum``.
    """
    page_url = "https://developers.google.com/appengine/"

    first_input = self.createMockData(
        (page_url, "http://k.yimg.jp/images/top/sp/logo.gif"))
    second_input = self.createMockData(
        (page_url, "/appengine/images/slide1.png"))

    crawl_record = CrawlDbDatum(
        parent=ndb.Key(CrawlDbDatum, page_url),
        url=page_url,
        extract_domain_url="https://developers.google.com",
        last_status=pipelines.UNFETCHED)
    crawl_record.put()

    # Every mocked fetch returns the same PNG payload.
    png_resource = self.getResource("slide1.png")
    png_bytes = png_resource.read()
    response_headers = {
        "Content-Length": len(png_bytes),
        "Content-Type": "image/png",
    }
    self.setReturnValue(content=png_bytes, headers=response_headers)

    pipeline = pipelines._FetchContentPipeline(
        "FetchContentPipeline", [first_input, second_input])
    pipeline.start()
    test_support.execute_until_empty(self.taskqueue)

    finished = pipelines._FetchSetsBufferPipeline.from_id(pipeline.pipeline_id)

    # Can open files
    output_paths = finished.outputs.default.value
    self.assertTrue(len(output_paths) > 0)
    self.assertTrue(output_paths[0].startswith("/blobstore/"))

    for raw_record in input_readers.RecordsReader(output_paths, 0):
        key_value = file_service_pb.KeyValue()
        key_value.ParseFromString(raw_record)
        record_key = key_value.key()
        record_value = key_value.value()
        self.assertTrue(record_key is not None)
        self.assertTrue(record_value is not None)

    matching_crawls = CrawlDbDatum.query(CrawlDbDatum.url == page_url).fetch()
    self.assertTrue(len(matching_crawls) > 0)
    stored_contents = ContentDbDatum.query(
        ancestor=matching_crawls[0].key).fetch()
    self.assertEqual(2, len(stored_contents))
def testFetchEndToEnd(self):
    """Run the whole fetcher job end to end against mocked HTTP responses.

    A robots.txt, an HTML page and a PNG image are registered through
    ``setReturnValue``.  ``FetcherPipeline`` is then run until the task queue
    is empty, and the datastore is checked for the crawl status and for the
    fetched, link and content records stored under the crawled URL.
    """
    page_url = "http://foo.com/bar.html"
    createMockCrawlDbDatum(page_url)

    static_robots = "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"
    self.setReturnValue(url="http://foo.com/robots.txt",
                        content=static_robots,
                        headers={"Content-Length": len(static_robots)})

    # static resource is read from resource
    resource = self.getResource("sample_content.html")
    static_content = resource.read()
    self.setReturnValue(url=page_url,
                        content=static_content,
                        headers={
                            "Content-Length": len(static_content),
                            "Content-Type": "text/html"
                        })

    resource_image = self.getResource("slide1.png")
    static_content_image = resource_image.read()
    self.setReturnValue(url="http://foo.com/images/slide1.png",
                        content=static_content_image,
                        headers={
                            "Content-Length": len(static_content_image),
                            "Content-Type": "image/png"
                        })

    p = pipelines.FetcherPipeline(
        "FetcherPipeline",
        params={"entity_kind": "lakshmi.datum.CrawlDbDatum"},
        parser_params={"text/html": __name__ + ".htmlParser"},
        shards=2)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url == page_url).fetch()
    # Check for a result before indexing so a missing record is reported as
    # an assertion failure rather than an IndexError.
    self.assertTrue(len(crawl_db_datums) > 0)
    crawl_db_datum = crawl_db_datums[0]
    # assertTrue(a, b) would treat b as the failure message and compare
    # nothing; assertEqual actually checks the stored status.
    self.assertEqual(pipelines.FETCHED, crawl_db_datum.last_status)

    fetched_db_datums = FetchedDbDatum.query(
        ancestor=crawl_db_datum.key).fetch()
    self.assertTrue(len(fetched_db_datums) > 0)
    fetched_db_datum = fetched_db_datums[0]
    self.assertEqual(page_url, fetched_db_datum.fetched_url)

    link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch()
    self.assertTrue(len(link_db_datums) > 0)

    contents_db_datums = ContentDbDatum.query(
        ancestor=crawl_db_datum.key).fetch()
    self.assertTrue(len(contents_db_datums) > 0)
def testFetchEndToEnd(self):
    """Exercise the complete fetcher job with mocked HTTP responses.

    Registers a robots.txt, an HTML page and a PNG image via
    ``setReturnValue``, runs ``FetcherPipeline`` until the task queue drains,
    and then verifies the crawl status together with the fetched, link and
    content records stored under the crawled URL.
    """
    target_url = "http://foo.com/bar.html"
    createMockCrawlDbDatum(target_url)

    static_robots = "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"
    self.setReturnValue(url="http://foo.com/robots.txt",
                        content=static_robots,
                        headers={"Content-Length": len(static_robots)})

    # static resource is read from resource
    resource = self.getResource("sample_content.html")
    static_content = resource.read()
    self.setReturnValue(url=target_url,
                        content=static_content,
                        headers={"Content-Length": len(static_content),
                                 "Content-Type": "text/html"})

    resource_image = self.getResource("slide1.png")
    static_content_image = resource_image.read()
    self.setReturnValue(url="http://foo.com/images/slide1.png",
                        content=static_content_image,
                        headers={"Content-Length": len(static_content_image),
                                 "Content-Type": "image/png"})

    p = pipelines.FetcherPipeline(
        "FetcherPipeline",
        params={"entity_kind": "lakshmi.datum.CrawlDbDatum"},
        parser_params={"text/html": __name__ + ".htmlParser"},
        shards=2)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url == target_url).fetch()
    # Fail with an assertion, not an IndexError, when nothing was stored.
    self.assertTrue(len(crawl_db_datums) > 0)
    crawl_db_datum = crawl_db_datums[0]
    # The two-argument assertTrue form only tests the first argument's
    # truthiness; assertEqual is needed to compare against the stored value.
    self.assertEqual(pipelines.FETCHED, crawl_db_datum.last_status)

    fetched_db_datums = FetchedDbDatum.query(ancestor=crawl_db_datum.key).fetch()
    self.assertTrue(len(fetched_db_datums) > 0)
    fetched_db_datum = fetched_db_datums[0]
    self.assertEqual(target_url, fetched_db_datum.fetched_url)

    link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch()
    self.assertTrue(len(link_db_datums) > 0)

    contents_db_datums = ContentDbDatum.query(ancestor=crawl_db_datum.key).fetch()
    self.assertTrue(len(contents_db_datums) > 0)
def finalized(self):
    """Mail the fetch results to the requester when this pipeline finishes.

    Reads the crawled URL and the recipient address from memcache (keys
    ``"url"`` and ``"email"``), renders ``templates/mail_template.html``
    (resolved relative to this module) with the content and link query
    results stored under the matching ``CrawlDbDatum``, and sends the page
    as the HTML body.  The rendered page is attached as ``sendmail.html``;
    the first fetched document, if any, is attached as
    ``fetched_content.html``.

    If no ``CrawlDbDatum`` matches the URL, a plain status mail without HTML
    body or attachments is sent rather than raising ``IndexError``.
    """
    status = "aborted" if self.was_aborted else "successful"
    subject = "Your Fetcher Job is " + status
    sender = "*****@*****.**"

    url = memcache.get("url")
    email = memcache.get("email")

    crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()
    if not crawl_db_datums:
        # No crawl record to report on (e.g. the memcache entry is gone):
        # skip rendering and just tell the requester how the job ended.
        mail.send_mail(sender=sender,
                       to=email,
                       subject=subject,
                       body="FetchResults: no crawl record was found for the requested URL.")
        return
    crawl_db_datum = crawl_db_datums[0]

    # Configure jinja for internal templates
    base_dir = os.path.realpath(os.path.dirname(__file__))
    env = Environment(autoescape=True,
                      extensions=["jinja2.ext.i18n"],
                      loader=FileSystemLoader(
                          os.path.join(base_dir, "templates")))

    # NOTE(review): these two queries are issued with fetch_async() and the
    # unresolved results go straight into the template, unchanged from the
    # previous behaviour; presumably the template resolves them -- confirm.
    content_db_datums = ContentDbDatum.query(
        ancestor=crawl_db_datum.key).fetch_async()
    fetched_db_datums = FetchedDbDatum.query(
        ancestor=crawl_db_datum.key).fetch()

    attachments = []
    if fetched_db_datums:
        # Only the first fetched document is attached.
        attachments.append(
            ("fetched_content.html", fetched_db_datums[0].fetched_content))

    link_db_datums = LinkDbDatum.query(
        ancestor=crawl_db_datum.key).fetch_async()
    html = env.get_template("mail_template.html").render(
        url=url, contents=content_db_datums, links=link_db_datums)
    attachments.append(("sendmail.html", html))

    mail.send_mail(sender=sender,
                   to=email,
                   subject=subject,
                   body="FetchResults",
                   html=html,
                   attachments=attachments)
def testSuccessfulRun(self):
    """Check that _FetchContentPipeline fetches and stores linked content.

    Two mock input files each map the same page URL to one linked resource,
    and all fetches are served the bytes of ``slide1.png``.  The test then
    reads back the pipeline's blobstore output records and expects two
    ``ContentDbDatum`` entities under the page's ``CrawlDbDatum``.
    """
    source_url = "https://developers.google.com/appengine/"
    linked_targets = (
        "http://k.yimg.jp/images/top/sp/logo.gif",
        "/appengine/images/slide1.png",
    )
    # One mock input file per (page, linked resource) pair, in order.
    input_files = [self.createMockData((source_url, target))
                   for target in linked_targets]

    seed = CrawlDbDatum(parent=ndb.Key(CrawlDbDatum, source_url),
                        url=source_url,
                        extract_domain_url="https://developers.google.com",
                        last_status=pipelines.UNFETCHED)
    seed.put()

    image_file = self.getResource("slide1.png")
    image_data = image_file.read()
    self.setReturnValue(content=image_data,
                        headers={"Content-Length": len(image_data),
                                 "Content-Type": "image/png"})

    job = pipelines._FetchContentPipeline("FetchContentPipeline", input_files)
    job.start()
    test_support.execute_until_empty(self.taskqueue)
    completed = pipelines._FetchSetsBufferPipeline.from_id(job.pipeline_id)

    # Can open files
    result_paths = completed.outputs.default.value
    self.assertTrue(len(result_paths) > 0)
    self.assertTrue(result_paths[0].startswith("/blobstore/"))

    record_reader = input_readers.RecordsReader(result_paths, 0)
    for serialized in record_reader:
        entry = file_service_pb.KeyValue()
        entry.ParseFromString(serialized)
        entry_key, entry_value = entry.key(), entry.value()
        self.assertTrue(entry_key is not None)
        self.assertTrue(entry_value is not None)

    crawl_query = CrawlDbDatum.query(CrawlDbDatum.url == source_url)
    found = crawl_query.fetch()
    self.assertTrue(len(found) > 0)
    parent_key = found[0].key
    children = ContentDbDatum.query(ancestor=parent_key).fetch()
    self.assertEqual(2, len(children))