Esempio n. 1
0
    def spider_run(self):
        """Parse every unparsed raw document and persist what the parser found.

        For each record yielded by ``doc_raw_adapter.load_unparsed_doc_raw()``
        this calls ``self.parse`` and then, in order: creates missing image
        indexes, creates or updates the data-raw entry when features were
        returned, explores each child link, and finally marks the document as
        ``DocRawStatus.DATA_PARSED`` together with its next update time and
        the hashes of the children that were explored without raising.

        NOTE(review): the handler of the outer ``try`` is not part of this
        excerpt, so how a failing document is treated cannot be seen here.
        """
        for url_hash, url, stage, page, encode, context, created_at, page_crawled_at in self.doc_raw_adapter.load_unparsed_doc_raw():
            try:
                self.logger.log("parsing [%s]."%(url_hash))
                features, images, next_update_time, children = self.parse(url_hash, page, encode, stage, context, created_at, page_crawled_at)
                # Create an index entry for every image that has none yet;
                # the image id is the hash of the image URL.
                if images != None:
                    for i in range(0, len(images)):
                        try:
                            image_id = common_utils.gen_url_hash(images[i]["url"])
                            if not self.image_store_adapter.has_image_index_by_image_id(image_id):
                                # NOTE(review): "image_id" is written onto the image
                                # dict only for images indexed here; images that were
                                # already indexed are left unchanged -- confirm intended.
                                images[i]["image_id"] = image_id
                                self.image_store_adapter.create_image_index(image_id, images[i]["image_format"], images[i]["url"])
                                self.logger.log("image [%s] created for [%s]."%(image_id, url_hash))
                        # Best effort: a failing image is logged and the loop moves on.
                        # BaseException also covers KeyboardInterrupt and SystemExit.
                        except BaseException, e:
                            self.logger.log("Error occured when creating image index: %s"%(e))
                
                # Create the data-raw entry when url_hash_exists_in_data_raw
                # reports none for this url_hash, otherwise update it.
                if features != None:
                    if not self.url_hash_exists_in_data_raw(url_hash):
                        self.data_raw_adapter.create_data_raw(url_hash, url, features, images)
                        self.logger.log("features for [%s] is added."%(url_hash))
                    else:
                        self.data_raw_adapter.update_data_raw(url_hash, features, images)
                        self.logger.log("features for [%s] is updated."%(url_hash))

                # Stays None (not an empty list) when parse() returned no children.
                children_url_hashes = None 
                if children != None:
                    children_url_hashes = []
                    for child in children:
                        try:
                            url_new = child["url"]
                            url_hash_new = common_utils.gen_url_hash(child["url"])
                            stage_new = child["stage"]
                            context_new = child["context"]
                            operation_flag = child["operation_flag"]
                            
                            self.explore_child(url_hash, url_new, url_hash_new, stage_new, context_new, operation_flag)
                            
                            # Only reached when explore_child did not raise, so a
                            # failed child is left out of the recorded list.
                            children_url_hashes.append(url_hash_new)
                        except BaseException, e:
                            self.logger.log("Error occured when exploring child: %s"%(e))
                
                # Store the node information and flag the document as parsed.
                self.doc_raw_adapter.update_doc_raw_with_node_info(url_hash, 
                                                                   next_update_time = next_update_time, 
                                                                   children = children_url_hashes,
                                                                   status_flag = DocRawStatus.DATA_PARSED)
Esempio n. 2
0
 def url_exists_in_doc_raw(self, url):
     """Report whether the doc-raw adapter already holds an entry for ``url``.

     The URL is turned into its hash with ``common_utils.gen_url_hash`` and
     the adapter's ``has_doc_raw_by_url_hash`` result is returned unchanged.
     """
     hashed_url = common_utils.gen_url_hash(url)
     lookup = self.doc_raw_adapter.has_doc_raw_by_url_hash
     return lookup(hashed_url)
Esempio n. 3
0
    def spider_run(self):
        """Parse every unparsed raw document and persist what the parser found.

        For each record yielded by ``doc_raw_adapter.load_unparsed_doc_raw()``
        this calls ``self.parse`` and then, in order: creates missing image
        indexes, creates or updates the data-raw entry when features were
        returned, explores each child link, and finally marks the document as
        ``DocRawStatus.DATA_PARSED`` together with its next update time and
        the hashes of the children that were explored without raising.

        NOTE(review): the handler of the outer ``try`` is not part of this
        excerpt, so how a failing document is treated cannot be seen here.
        """
        for url_hash, url, stage, page, encode, context, created_at, page_crawled_at in self.doc_raw_adapter.load_unparsed_doc_raw(
        ):
            try:
                self.logger.log("parsing [%s]." % (url_hash))
                features, images, next_update_time, children = self.parse(
                    url_hash, page, encode, stage, context, created_at,
                    page_crawled_at)
                # Create an index entry for every image that has none yet;
                # the image id is the hash of the image URL.
                if images != None:
                    for i in range(0, len(images)):
                        try:
                            image_id = common_utils.gen_url_hash(
                                images[i]["url"])
                            if not self.image_store_adapter.has_image_index_by_image_id(
                                    image_id):
                                # NOTE(review): "image_id" is written onto the image
                                # dict only for images indexed here; images that were
                                # already indexed are left unchanged -- confirm intended.
                                images[i]["image_id"] = image_id
                                self.image_store_adapter.create_image_index(
                                    image_id, images[i]["image_format"],
                                    images[i]["url"])
                                self.logger.log(
                                    "image [%s] created for [%s]." %
                                    (image_id, url_hash))
                        # Best effort: a failing image is logged and the loop moves on.
                        # BaseException also covers KeyboardInterrupt and SystemExit.
                        except BaseException, e:
                            self.logger.log(
                                "Error occured when creating image index: %s" %
                                (e))

                # Create the data-raw entry when url_hash_exists_in_data_raw
                # reports none for this url_hash, otherwise update it.
                if features != None:
                    if not self.url_hash_exists_in_data_raw(url_hash):
                        self.data_raw_adapter.create_data_raw(
                            url_hash, url, features, images)
                        self.logger.log("features for [%s] is added." %
                                        (url_hash))
                    else:
                        self.data_raw_adapter.update_data_raw(
                            url_hash, features, images)
                        self.logger.log("features for [%s] is updated." %
                                        (url_hash))

                # Stays None (not an empty list) when parse() returned no children.
                children_url_hashes = None
                if children != None:
                    children_url_hashes = []
                    for child in children:
                        try:
                            url_new = child["url"]
                            url_hash_new = common_utils.gen_url_hash(
                                child["url"])
                            stage_new = child["stage"]
                            context_new = child["context"]
                            operation_flag = child["operation_flag"]

                            self.explore_child(url_hash, url_new, url_hash_new,
                                               stage_new, context_new,
                                               operation_flag)

                            # Only reached when explore_child did not raise, so a
                            # failed child is left out of the recorded list.
                            children_url_hashes.append(url_hash_new)
                        except BaseException, e:
                            self.logger.log(
                                "Error occured when exploring child: %s" % (e))

                # Store the node information and flag the document as parsed.
                self.doc_raw_adapter.update_doc_raw_with_node_info(
                    url_hash,
                    next_update_time=next_update_time,
                    children=children_url_hashes,
                    status_flag=DocRawStatus.DATA_PARSED)
Esempio n. 4
0
        # Load the seed file of one source and register every seed URL that
        # is not stored yet. `i` indexes sys.argv and is bound by code that
        # lies outside this excerpt.
        source_name = sys.argv[i]
        seed_path = "seeds/" + source_name + ".xml"
        if os.path.exists(seed_path):
            print("Loading seed from file %s..."%(seed_path))
            # A context manager closes the file even when reading or parsing
            # raises; a bare open()/close() pair would leak the handle then.
            with open(seed_path, "r") as f:
                seeds = BeautifulSoup(f.read())

            doc_raw_adapter = DocRawAdapter(database_config_path, source_name)
            for seed in seeds.findAll("seed"):
                url = seed.url.string
                stage = seed.stage.string
                # Build the context dict from the tags found under the seed's
                # context element; values marked type="int" are converted,
                # all others are stored as the parser returned them.
                context = {}
                for content in seed.context.findAll():
                    if content.get("type") == "int":
                        context[content.name] = int(content.string)
                    else:
                        context[content.name] = content.string

                # Documents are keyed by the hash of their URL; a seed whose
                # hash is already stored is skipped silently.
                url_hash = common_utils.gen_url_hash(url)
                if not doc_raw_adapter.has_doc_raw_by_url_hash(url_hash):
                    doc_raw_adapter.create_doc_raw(url_hash, url, stage, context)
                    print("%s added into %s"%(url, source_name))
        else:
            print("Can't find seed file %s"%(seed_path))




Esempio n. 5
0
 def url_exists_in_doc_raw(self, url):
     """Report whether the doc-raw adapter already holds an entry for ``url``.

     The URL is turned into its hash with ``common_utils.gen_url_hash`` and
     the adapter's ``has_doc_raw_by_url_hash`` result is returned unchanged.
     """
     hashed_url = common_utils.gen_url_hash(url)
     lookup = self.doc_raw_adapter.has_doc_raw_by_url_hash
     return lookup(hashed_url)
Esempio n. 6
0
if __name__ == '__main__':
    # Seed loader: every command-line argument names a source whose seeds are
    # read from seeds/<source_name>.xml (relative to the working directory)
    # and registered through a DocRawAdapter for that source.
    for source_name in sys.argv[1:]:
        seed_path = "seeds/" + source_name + ".xml"
        if not os.path.exists(seed_path):
            print("Can't find seed file %s" % (seed_path))
            continue

        print("Loading seed from file %s..." % (seed_path))
        # A context manager closes the file even when reading or parsing
        # raises; a bare open()/close() pair would leak the handle then.
        with open(seed_path, "r") as f:
            seeds = BeautifulSoup(f.read())

        doc_raw_adapter = DocRawAdapter(database_config_path, source_name)
        for seed in seeds.findAll("seed"):
            url = seed.url.string
            stage = seed.stage.string
            # Build the context dict from the tags found under the seed's
            # context element; values marked type="int" are converted, all
            # others are stored as the parser returned them.
            context = {}
            for content in seed.context.findAll():
                if content.get("type") == "int":
                    context[content.name] = int(content.string)
                else:
                    context[content.name] = content.string

            # Documents are keyed by the hash of their URL; a seed whose hash
            # is already stored is skipped silently.
            url_hash = common_utils.gen_url_hash(url)
            if not doc_raw_adapter.has_doc_raw_by_url_hash(url_hash):
                doc_raw_adapter.create_doc_raw(url_hash, url, stage, context)
                print("%s added into %s" % (url, source_name))