def __init__(self, database_config_path, source_name, domain, encode, request_interval):
    # Per-domain logger and a raw-document adapter backed by the database config.
    self.logger = Logger("crawler", domain)
    self.adapter = DocRawAdapter(database_config_path, source_name, self.logger)
    self.domain = domain
    self.encode = encode
    # Pause between successive requests, used to throttle crawling.
    self.request_interval = request_interval
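# A minimal construction sketch, assuming the enclosing class is named Crawler
# (the class name is not shown in the source) and that database_config.xml sits
# one directory up, as in spider_seeds.py; all argument values are illustrative:
#
#     crawler = Crawler("../database_config.xml", "example_source",
#                       domain="example.com", encode="utf-8",
#                       request_interval=1.0)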
def __init__(self, data_adapter_config_path, source_name, encode="utf-8", parse_try_limit=3):
    self.logger = Logger("spider", source_name)
    # Adapters for raw documents, parsed data records, and stored images.
    self.doc_raw_adapter = DocRawAdapter(data_adapter_config_path, source_name, self.logger)
    self.data_raw_adapter = DataRawAdapter(data_adapter_config_path, source_name, self.logger)
    self.image_store_adapter = ImageStoreAdapter(data_adapter_config_path, self.logger)
    self.source_name = source_name
    self.encode = encode
    # Maximum number of parse attempts per document before giving up.
    self.parse_try_limit = parse_try_limit
    self.exploring_times = 0
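# A minimal construction sketch, assuming the enclosing class is named Spider
# (the class name is not shown in the source); encode and parse_try_limit fall
# back to their defaults of "utf-8" and 3:
#
#     spider = Spider("../database_config.xml", "example_source")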
import os
import sys

from bs4 import BeautifulSoup

# Project-local modules; the import paths are assumptions based on the names
# used below.
import common_utils
from doc_raw_adapter import DocRawAdapter

# Usage banner.
print("spider_seeds.py source_name_1, [source_name_2, ...]")

database_config_path = "../database_config.xml"

if __name__ == '__main__':
    for i in range(1, len(sys.argv)):
        source_name = sys.argv[i]
        seed_path = "seeds/" + source_name + ".xml"
        if os.path.exists(seed_path):
            print("Loading seed from file %s..." % seed_path)
            with open(seed_path, "r") as f:
                # Parse the seed file as XML; the "xml" parser (requires lxml)
                # preserves tag case, which the default HTML parser would not.
                seeds = BeautifulSoup(f.read(), "xml")
            doc_raw_adapter = DocRawAdapter(database_config_path, source_name)
            for seed in seeds.findAll("seed"):
                url = seed.url.string
                stage = seed.stage.string
                # Collect the typed context values attached to this seed.
                context = {}
                for content in seed.context.findAll():
                    if content.get("type") == "int":
                        context[content.name] = int(content.string)
                    else:
                        context[content.name] = content.string
                # Enqueue the seed only if this URL is not already stored.
                url_hash = common_utils.gen_url_hash(url)
                if not doc_raw_adapter.has_doc_raw_by_url_hash(url_hash):
                    doc_raw_adapter.create_doc_raw(url_hash, url, stage, context)
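# For reference, a seed file this script can parse might look like the sketch
# below (e.g. seeds/example_source.xml). The <seed>, <url>, <stage>, and
# <context> tags come from the parsing code above; the root element name, the
# concrete values, and the <page>/<category> context fields are made up for
# illustration:
#
# <seeds>
#   <seed>
#     <url>http://example.com/list</url>
#     <stage>list_page</stage>
#     <context>
#       <page type="int">1</page>
#       <category>books</category>
#     </context>
#   </seed>
# </seeds>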