import os
import json
import logging

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException


def load_crawler_configuration(self, path):
    # Load every YAML config reachable from `path` (a file or a directory).
    files = load_config(path)
    result = list()
    for f in files:
        try:
            result.append(load_yaml(f))
        except Exception as e:
            logging.error("Error {}".format(str(e)))
    return result
def generate_file(filename, data):
    # Resolve the target path and write `data`, reporting success as a bool.
    filename = get_path(filename)
    try:
        with open(filename, "w+") as f:
            f.write(data)
        return True
    except Exception as e:
        logging.error(str(e))
        return False
def create_confdir(CONF_PATH=CONF_PATH, DUMP_PATH=DUMP_LOCATION,
                   HTML_PATH=HTML_LOCATION):
    # Create each working directory; bail out on the first failure.
    CONF_DIR = [CONF_PATH, DUMP_PATH, HTML_PATH]
    for path in CONF_DIR:
        check = create_dir(path)
        if not check:
            logging.error("Creating directory failed: {}".format(path))
            return False
    return True
def load_config(path):
    # Return the list of config files under `path` (a file or a directory).
    if os.path.exists(path):
        if os.path.isdir(path):
            return get_all(path)
        return [path]
    logging.error("File/folder does not exist!")
    return list()  # empty list so callers can iterate the result safely
def __init__(self, **kwargs):
    self.base_url = None
    self.company_name = None
    self.product_list = list()
    self.currency = None
    # Allow arbitrary configuration keys to be set as attributes.
    for key, value in kwargs.items():
        try:
            setattr(self, key, value)
        except Exception:
            logging.error("Cannot set value for {}".format(key))
def get_loaded(driver, count=0):
    # Wait for <body> to appear; on timeout, refresh and retry up to 3 times.
    try:
        WebDriverWait(driver, 3).until(
            EC.presence_of_element_located((By.XPATH, '//body')))
        return True
    except TimeoutException:
        logging.error("Loading took too long")
        driver.refresh()
        if count < 3:
            return get_loaded(driver, count=count + 1)
        return False
def load_crawler_configuration(path):
    # Parse every YAML file found at `path`, skipping unreadable ones.
    files = load_config(path)
    f_yaml = list()
    for f in files:
        try:
            tmp = load_yaml(f)
        except Exception as e:
            logging.error("Error {}".format(str(e)))
            continue
        else:
            f_yaml.append(tmp)
    return f_yaml
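# A minimal sketch (hypothetical layout) of a crawler YAML that
# load_crawler_configuration() would return. The shape is inferred from how
# run() below consumes each document: the first entry carries 'config' with
# a 'name' key, and subsequent entries carry a 'product' mapping.
#
#   - config:
#       - name: example-shop
#       - base_url: "https://example.com"
#   - product:
#       - endpoint: "/catalog/phones"
#
#   configs = load_crawler_configuration("confs/")  # one parsed list per file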
def get_element(self, elements):
    # Resolve each locator in `elements`; return the last element found.
    driver = self.driver
    to_element = None
    for locator_, value in elements.items():
        delay = 3
        type_ = elementFilterTool[locator_]
        try:
            WebDriverWait(driver, delay).until(
                EC.presence_of_element_located((type_, value)))
        except TimeoutException:
            logging.error("Loading took too much time!")
            continue  # element never appeared; skip it instead of raising
        to_element = driver.find_element(type_, value)
    return to_element
def run(path=CONF_PATH, force_headless=False, force_dump=True,
        dump_to_json=False, dump_location=DUMP_LOCATION):
    # Entry point: load every crawler config, scrape each product endpoint,
    # and collect the normalized results per company.
    crawler_result = list()
    configs = load_crawler_configuration(path)
    failure = {"send": list(), "scrape": list()}
    for datas in configs:
        result = list()
        result_ = {"company": None, "data": list()}
        write_to_json = {"company": None, "data": list()}
        cfg = flatten_dictionaries(datas[0]['config'])
        cfg['company_name'] = cfg.pop('name')
        _company_details = None
        for row in datas:
            if 'product' not in row:
                continue
            prods_ = flatten_dictionaries(row['product'])
            d = ProductCrawler(cfg, is_headless=force_headless, **prods_)
            _company_details = d.company_detail
            d.config_worker()
            d.register_company()
            try:
                dd = d.run()
            except Exception as e:
                logging.error(str(e))
            else:
                normalized_data = d.normalize(dd)
                d.write_result(normalized_data)
                for key, value in normalized_data.items():
                    if not value:
                        failure['scrape'].append(d.endpoint)
                        break
                _tmp = d.crawler_result()
                if d.dump_to_database:
                    result_['data'].append(_tmp)
                write_to_json['data'].append(_tmp)
                result.append(dd)
            # Quit the driver whether or not the scrape succeeded.
            d.driver.quit()
        result_['company'] = _company_details
        write_to_json['company'] = _company_details
        if dump_to_json:
            dump_json_data(write_to_json)
        crawler_result.append(result_)
    return crawler_result
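# A minimal usage sketch, assuming CONF_PATH points at a directory of YAML
# files like the one shown above: run headless and dump each company's
# results to JSON in addition to the database write.
#
#   results = run(force_headless=True, dump_to_json=True)
#   for company in results:
#       print(company['company'], len(company['data']))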
def create_dir(path):
    # Create `path`, recursively creating missing parents.
    try:
        os.mkdir(path)
    except FileNotFoundError:
        # Parent is missing: create it first, then retry this level.
        head = os.path.split(path)[0]
        create_dir(head)
        return create_dir(path)
    except FileExistsError:
        return path
    except Exception as e:
        logging.error(e)
        return False
    return path
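# Usage sketch: create_dir walks up until an existing ancestor is found, so
# a nested path works in one call; the standard library's
# os.makedirs(path, exist_ok=True) would behave equivalently here.
#
#   create_dir("/tmp/crawler/html/dump")  # returns the path on success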
def scrape(self, configs):
    # Run each configured crawler, recording per-config scrape status.
    result = {"company": None, "data": list()}
    for config in configs:
        config['status'] = dict()
        crawler = config['config']
        _company_details = crawler.company_detail
        crawler.config_worker()
        # crawler.register_company()
        try:
            scraped_data = crawler.run()
        except Exception as e:
            logging.error(str(e))
            config['status']['scrape'] = {
                "status": False,
                "message": str(e)
            }
        else:
            config['status']['scrape'] = {
                "status": True,
                "message": "Success"
            }
            normalized_data = crawler.normalize(scraped_data)
            if not normalized_data:
                crawler.driver.quit()
                return configs
            cleaned_data = self.check_duplicate(normalized_data)
            crawler.write_result(cleaned_data)
            for key, value in normalized_data.items():
                if not value:
                    config['status']['scrape'] = {
                        "status": False,
                        "message": "No Result!"
                    }
                    break
            result['company'] = _company_details
            result['data'].append(crawler.crawler_result())
            self.flattened_data.append(crawler.flatten_data_result())
        # Quit the driver whether or not the scrape succeeded.
        crawler.driver.quit()
    self.scrape_result = result
    return configs
def config_worker(self):
    # Spin up a browser worker, load the target URL, and size the window.
    self.worker = Worker(self.is_headless)
    worker = self.worker
    url = self.get_url()
    worker.get(url)
    wait = get_loaded(worker.driver)
    if not wait:
        logging.error("Page not loaded")
    self.driver = worker.driver
    if self.window_size:
        logging.debug("Resizing {} x {}".format(self.window_size_x,
                                                self.window_size_y))
        self.driver.set_window_size(int(self.window_size_x),
                                    int(self.window_size_y))
    else:
        self.driver.maximize_window()
    self.action = worker.action
    self.config_action_chains()
def send_data(es_handler, datasets):
    # Index each document into Elasticsearch, filtering 'domain' records
    # against the known domain types (fetched from ES, with a fallback list).
    try:
        res = es_handler.search(index="domain_type")
        domain_type = [i["_source"]["type"] for i in res["hits"]["hits"]]
    except Exception:
        domain_type = ['.id', '.com', '.xyz', '.net', '.org', '.co.id',
                       '.web.id', '.my.id', '.biz.id', '.ac.id', '.sch.id',
                       '.biz', '.co', '.tv', '.io', '.info']
    result = list()
    for i in datasets:
        if i['_index'] == 'domain':
            if i['nm_domain_type'].lower() not in domain_type:
                continue
        try:
            res = es_handler.index(index=i.pop("_index"),
                                   id=i.pop("_id"), body=i)
        except Exception as e:
            logging.error(str(e))
            res = {"status": False}
        result.append(res)
    return result
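# A minimal sketch of the document shape send_data() expects; the metadata
# keys come from normalize() below, and the values here are hypothetical:
#
#   datasets = [{
#       "_index": "domain",
#       "_id": "3f9a...",           # content hash from generate_id()
#       "nm_domain_type": ".com",
#       # ...remaining normalized fields become the ES document body
#   }]
#   send_data(es_handler, datasets)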
def normalize(self, dataset):
    # Map raw scrape rows onto the configured schema for their product type,
    # attaching a deterministic _id and splitting out additional features.
    result = list()
    for data in dataset:
        product_type = data['nm_product_type']
        if product_type not in self.config:
            msg = "index {} must be initiated on configuration".format(
                product_type)
            logging.error(msg)
            return list()
        __datatmp = dict()
        __datatmp["_index"] = product_type
        for key, val in self.config[product_type].items():
            required = val.get('required', False)
            if required:
                try:
                    __datatmp[key] = data[key]
                except KeyError:
                    logging.error(data)
                    msg = "{} is required for product type {}".format(
                        key, product_type)
                    logging.error(msg)
                    return list()
            else:
                __datatmp[key] = data.get(key, "None")
        _id = generate_id(json.dumps(__datatmp))
        __datatmp['_id'] = _id
        result.append(__datatmp)
        if 'additional_features' in data:
            # Link the extra attributes back to the parent document.
            __datatmp_add = {
                "_parent_id": _id,
                "_parent_index": product_type,
                "_index": "additional_features",
                **data['additional_features']
            }
            __datatmp_add["_id"] = generate_id(json.dumps(__datatmp_add))
            result.append(__datatmp_add)
    return result
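# A minimal sketch of the self.config shape normalize() reads (field names
# are hypothetical): each product type maps field names to constraints, of
# which only the 'required' flag is consulted above.
#
#   self.config = {
#       "domain": {
#           "nm_domain": {"required": True},
#           "nm_domain_type": {"required": True},
#           "price": {"required": False},
#       }
#   }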
def run(self):
    # Build the action chain from the query, then execute it.
    try:
        self.generate_actions(self.query)
        self.execute()
    except Exception as e:
        logging.error(str(e))