Ejemplo n.º 1
0
 def __init__(self, database_config_path, source_name, domain, encode,
              request_interval):
     self.logger = Logger("crawler", domain)
     self.adapter = DocRawAdapter(database_config_path, source_name,
                                  self.logger)
     self.domain = domain
     self.encode = encode
     self.request_interval = request_interval
Ejemplo n.º 2
0
    def __init__(self,
                 data_adapter_config_path,
                 source_name,
                 encode="utf-8",
                 parse_try_limit=3):
        self.logger = Logger("spider", source_name)

        self.doc_raw_adapter = DocRawAdapter(data_adapter_config_path,
                                             source_name, self.logger)
        self.data_raw_adapter = DataRawAdapter(data_adapter_config_path,
                                               source_name, self.logger)
        self.image_store_adapter = ImageStoreAdapter(data_adapter_config_path,
                                                     self.logger)
        self.source_name = source_name
        self.encode = encode
        self.parse_try_limit = parse_try_limit
        self.exploring_times = 0
Ejemplo n.º 3
0
    print("spider_seeds.py source_name_1, [source_name_2, ...]")


database_config_path = "../database_config.xml"
if __name__ == '__main__':

    for i in range(1, len(sys.argv)):
        source_name = sys.argv[i]
        seed_path = "seeds/" + source_name + ".xml"
        if os.path.exists(seed_path):
            print("Loading seed from file %s..." % (seed_path))
            f = open(seed_path, "r")
            seeds = BeautifulSoup(f.read())
            f.close()

            doc_raw_adapter = DocRawAdapter(database_config_path, source_name)
            for seed in seeds.findAll("seed"):
                url = seed.url.string
                stage = seed.stage.string
                context = {}
                for content in seed.context.findAll():
                    content_type = content.get("type")
                    if content_type == "int":
                        context[content.name] = int(content.string)
                    else:
                        context[content.name] = content.string

                url_hash = common_utils.gen_url_hash(url)
                if not doc_raw_adapter.has_doc_raw_by_url_hash(url_hash):
                    doc_raw_adapter.create_doc_raw(url_hash, url, stage,
                                                   context)