def spider_run(self):
    for url_hash, url, stage, page, encode, context, created_at, page_crawled_at in self.doc_raw_adapter.load_unparsed_doc_raw():
        try:
            self.logger.log("parsing [%s]." % (url_hash))
            features, images, next_update_time, children = self.parse(
                url_hash, page, encode, stage, context, created_at, page_crawled_at)

            # Index any images found on the page, keyed by a hash of the image URL.
            if images is not None:
                for image in images:
                    try:
                        image_id = common_utils.gen_url_hash(image["url"])
                        if not self.image_store_adapter.has_image_index_by_image_id(image_id):
                            image["image_id"] = image_id
                            self.image_store_adapter.create_image_index(
                                image_id, image["image_format"], image["url"])
                            self.logger.log("image [%s] created for [%s]." % (image_id, url_hash))
                    except BaseException as e:
                        self.logger.log("Error occurred when creating image index: %s" % (e))

            # Create or update the parsed feature record for this page.
            if features is not None:
                if not self.url_hash_exists_in_data_raw(url_hash):
                    self.data_raw_adapter.create_data_raw(url_hash, url, features, images)
                    self.logger.log("features for [%s] added." % (url_hash))
                else:
                    self.data_raw_adapter.update_data_raw(url_hash, features, images)
                    self.logger.log("features for [%s] updated." % (url_hash))

            # Queue every child URL discovered during parsing, remembering their hashes.
            children_url_hashes = None
            if children is not None:
                children_url_hashes = []
                for child in children:
                    try:
                        url_new = child["url"]
                        url_hash_new = common_utils.gen_url_hash(child["url"])
                        stage_new = child["stage"]
                        context_new = child["context"]
                        operation_flag = child["operation_flag"]
                        self.explore_child(url_hash, url_new, url_hash_new,
                                           stage_new, context_new, operation_flag)
                        children_url_hashes.append(url_hash_new)
                    except BaseException as e:
                        self.logger.log("Error occurred when exploring child: %s" % (e))

            # Mark the document as parsed and record its children and refresh time.
            self.doc_raw_adapter.update_doc_raw_with_node_info(
                url_hash,
                next_update_time=next_update_time,
                children=children_url_hashes,
                status_flag=DocRawStatus.DATA_PARSED)
        except BaseException as e:
            # Log and move on to the next document rather than aborting the run.
            self.logger.log("Error occurred when parsing [%s]: %s" % (url_hash, e))
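# For reference, a hypothetical sketch of the parse() contract that spider_run()
# depends on. Nothing below exists in this codebase; the return shape
# (features, images, next_update_time, children) and the dict keys are
# inferred from how spider_run() consumes them above.
#
#     def parse(self, url_hash, page, encode, stage, context, created_at, page_crawled_at):
#         features = {"title": "..."}      # parsed fields, or None to skip data_raw
#         images = [{"url": "http://example.com/a.jpg",   # indexed via image_store_adapter
#                    "image_format": "jpg"}]              # or None if no images
#         next_update_time = None          # when the page should be re-crawled
#         children = [{"url": "http://example.com/next",  # links to crawl next
#                      "stage": stage,
#                      "context": context,
#                      "operation_flag": None}]           # passed to explore_child()
#         return features, images, next_update_time, children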
def url_exists_in_doc_raw(self, url):
    url_hash = common_utils.gen_url_hash(url)
    return self.doc_raw_adapter.has_doc_raw_by_url_hash(url_hash)
if __name__ == '__main__':
    # Each command-line argument names a source whose seeds live in seeds/<source>.xml.
    for i in range(1, len(sys.argv)):
        source_name = sys.argv[i]
        seed_path = "seeds/" + source_name + ".xml"
        if os.path.exists(seed_path):
            print("Loading seed from file %s..." % (seed_path))
            with open(seed_path, "r") as f:
                seeds = BeautifulSoup(f.read())
            doc_raw_adapter = DocRawAdapter(database_config_path, source_name)
            for seed in seeds.findAll("seed"):
                url = seed.url.string
                stage = seed.stage.string
                # Rebuild the context dict from the seed's typed child elements.
                context = {}
                for content in seed.context.findAll():
                    content_type = content.get("type")
                    if content_type == "int":
                        context[content.name] = int(content.string)
                    else:
                        context[content.name] = content.string
                url_hash = common_utils.gen_url_hash(url)
                if not doc_raw_adapter.has_doc_raw_by_url_hash(url_hash):
                    doc_raw_adapter.create_doc_raw(url_hash, url, stage, context)
                    print("%s added into %s" % (url, source_name))
        else:
            print("Can't find seed file %s" % (seed_path))
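# A hypothetical seed file illustrating the layout the loader above expects.
# The tag names and the type="int" attribute come from the parsing code; the
# file name and the values are invented for illustration:
#
#   seeds/my_source.xml
#
#   <seeds>
#     <seed>
#       <url>http://example.com/index</url>
#       <stage>list</stage>
#       <context>
#         <depth type="int">0</depth>
#         <category>news</category>
#       </context>
#     </seed>
#   </seeds>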