def createMockFetchedDatum(url, html_text, status):
    """Create FetchedDatum mock data.

    Inserts (or reuses) a CrawlDbDatum keyed by *url* and, unless the
    status is UNFETCHED, stores a child FetchedDbDatum carrying the
    supplied HTML as "text/html" content.
    """
    parent_key = ndb.Key(CrawlDbDatum, url)
    crawl_datum = CrawlDbDatum.get_or_insert(
        url,
        parent=parent_key,
        url=url,
        last_status=status)
    # Only fetched pages get a FetchedDbDatum child entity.
    if status != pipelines.UNFETCHED:
        FetchedDbDatum(
            parent=crawl_datum.key,
            url=url,
            fetched_url=url,
            fetched_content=html_text,
            content_type="text/html").put()
def createMockFetchedDatum(url, html_text, status):
    """Create FetchedDatum mock data.

    Ensures a CrawlDbDatum exists for *url* with the given last_status.
    When the status indicates the page was fetched (anything other than
    pipelines.UNFETCHED), a FetchedDbDatum child holding *html_text* is
    written as well.
    """
    entity_key = ndb.Key(CrawlDbDatum, url)
    crawl = CrawlDbDatum.get_or_insert(
        url, parent=entity_key, url=url, last_status=status)
    if status == pipelines.UNFETCHED:
        return
    datum = FetchedDbDatum(
        parent=crawl.key,
        url=url,
        fetched_url=url,
        fetched_content=html_text,
        content_type="text/html")
    datum.put()
def _fetchMap(binary_record):
    """Map function that fetches one URL and stores the result.

    Parses *binary_record* as a KeyValue proto (key = URL, value = a
    boolean string saying whether the URL may be fetched), fetches the
    page, stores a FetchedDbDatum under the first matching CrawlDbDatum,
    and updates last_status/last_fetched on all matching CrawlDbDatum
    entities.

    Args:
        binary_record: serialized file_service_pb.KeyValue record.

    Yields:
        "<url>\n" when the fetch succeeded, otherwise "".
    """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    url = proto.key()
    could_fetch = _str2bool(proto.value())
    result = UNFETCHED
    fetched_url = ""
    fetch_date = None
    # Bug fix: the original left this name unbound when the query setup
    # raised, causing a NameError in the status-update step below.
    crawl_db_datum_future = None
    # Kick off the CrawlDbDatum lookup asynchronously.
    try:
        query = CrawlDbDatum.query(CrawlDbDatum.url == url)
        crawl_db_datum_future = query.fetch_async()
    except Exception as e:
        # str(e) instead of e.message: not every exception has .message.
        logging.warning("Failed create key, caused by invalid url:%s:%s",
                        url, e)
        could_fetch = False

    if could_fetch:
        # Start the fetch with the configured fetcher policy.
        fetcher = fetchers.SimpleHttpFetcher(
            1, fetcher_policy_yaml.fetcher_policy)
        try:
            fetch_result = fetcher.get(url)
            if fetch_result:
                # Store the fetched page under the first matching datum.
                crawl_db_datums = crawl_db_datum_future.get_result()
                fetched_datum = FetchedDbDatum(
                    parent=crawl_db_datums[0].key,
                    url=url,
                    fetched_url=fetch_result.get("fetched_url"),
                    fetch_time=fetch_result.get("time"),
                    fetched_content=fetch_result.get("content"),
                    content_type=fetch_result.get("mime_type"),
                    # NOTE(review): content_size and response_rate both read
                    # "read_rate" — content_size likely should use a size key;
                    # confirm against SimpleHttpFetcher's result dict.
                    content_size=fetch_result.get("read_rate"),
                    response_rate=fetch_result.get("read_rate"),
                    http_headers=str(fetch_result.get("headers")))
                fetched_datum.put()
                # Record time of last successful fetch.
                result = FETCHED
                fetch_date = datetime.datetime.now()
                fetched_url = "%s\n" % url
        except Exception as e:
            logging.warning("Fetch Page Error Occurs:%s", e)
            result = FAILED
    else:
        result = FAILED

    # Update status on all matching datums; skipped when the query never
    # started (invalid URL), since there is nothing to update.
    if crawl_db_datum_future is not None:
        crawl_db_datums = crawl_db_datum_future.get_result()
        for datum in crawl_db_datums:
            datum.last_status = result
            datum.last_fetched = fetch_date
        ndb.put_multi(crawl_db_datums)
    yield fetched_url
def _fetchMap(binary_record):
    """Map function that fetches one URL and records the outcome.

    Decodes *binary_record* (file_service_pb.KeyValue: key is the URL,
    value is a boolean string gating the fetch), performs the HTTP fetch,
    persists a FetchedDbDatum under the first CrawlDbDatum matching the
    URL, then updates last_status/last_fetched on every matching
    CrawlDbDatum.

    Args:
        binary_record: serialized file_service_pb.KeyValue record.

    Yields:
        "<url>\n" on success, "" otherwise.
    """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    url = proto.key()
    could_fetch = _str2bool(proto.value())
    result = UNFETCHED
    fetched_url = ""
    fetch_date = None
    # Bug fix: initialize so the final status-update step cannot hit a
    # NameError when the query construction below fails.
    crawl_db_datum_future = None
    try:
        # Asynchronous lookup of the CrawlDbDatum entities for this URL.
        query = CrawlDbDatum.query(CrawlDbDatum.url == url)
        crawl_db_datum_future = query.fetch_async()
    except Exception as e:
        # Use the exception object directly; .message is not guaranteed.
        logging.warning("Failed create key, caused by invalid url:%s:%s",
                        url, e)
        could_fetch = False

    if could_fetch:
        fetcher = fetchers.SimpleHttpFetcher(
            1, fetcher_policy_yaml.fetcher_policy)
        try:
            fetch_result = fetcher.get(url)
            if fetch_result:
                crawl_db_datums = crawl_db_datum_future.get_result()
                # Typo fix: local was named "fetche_datum".
                fetched_datum = FetchedDbDatum(
                    parent=crawl_db_datums[0].key,
                    url=url,
                    fetched_url=fetch_result.get("fetched_url"),
                    fetch_time=fetch_result.get("time"),
                    fetched_content=fetch_result.get("content"),
                    content_type=fetch_result.get("mime_type"),
                    # NOTE(review): "read_rate" feeds both content_size and
                    # response_rate — looks like a copy-paste; verify the
                    # intended key for content_size in the fetcher result.
                    content_size=fetch_result.get("read_rate"),
                    response_rate=fetch_result.get("read_rate"),
                    http_headers=str(fetch_result.get("headers")))
                fetched_datum.put()
                result = FETCHED
                fetch_date = datetime.datetime.now()
                fetched_url = "%s\n" % url
        except Exception as e:
            logging.warning("Fetch Page Error Occurs:%s", e)
            result = FAILED
    else:
        result = FAILED

    # Propagate the outcome to every matching datum; nothing to update if
    # the query was never issued.
    if crawl_db_datum_future is not None:
        crawl_db_datums = crawl_db_datum_future.get_result()
        for datum in crawl_db_datums:
            datum.last_status = result
            datum.last_fetched = fetch_date
        ndb.put_multi(crawl_db_datums)
    yield fetched_url