def crawl_data(url, site_id, max_retry_times):
    """
    crawl and save url raw data in local
    :param url: url
    :param site_id: site_id as file name
    :param max_retry_times: max retry times
    :return: True if success, else False
    """
    success_flag = False
    if not check_site_by_id(site_id):
        return success_flag

    current_retry_count = 0
    while current_retry_count < max_retry_times:
        try:
            req_headers = {
                'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'
            }
            req = urllib.request.Request(url, headers=req_headers)
            response = urllib.request.urlopen(req, timeout=60)
            content = response.read()

            # Save the raw page to '<URL_CRAWLED_DATA_DIR><site_id>_new'
            data_file_path = config.URL_CRAWLED_DATA_DIR + str(site_id) + '_new'
            with open(data_file_path, 'wb') as f:
                f.write(content)

            # Update last indexed time in DB (haystack/site)
            engine = mysql_engine()
            db_session = sessionmaker(bind=engine)
            session = db_session()
            session.query(Site).filter(Site.site_id == site_id).update({
                Site.last_indexed:
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            })
            session.commit()
            session.close()

            success_flag = True
            break
        except urllib.error.HTTPError as e:
            print(e.code, e.reason)
            current_retry_count += 1
            print('Retry:', current_retry_count, '/', max_retry_times)
            continue
        except urllib.error.URLError as e:
            print(e.reason)
            current_retry_count += 1
            print('Retry:', current_retry_count, '/', max_retry_times)
            continue
        except ConnectionResetError:
            print('ConnectionResetError')
            # Back off briefly before retrying a reset connection
            time.sleep(random.uniform(0, 2))
            current_retry_count += 1
            print('Retry:', current_retry_count, '/', max_retry_times)
            continue
        except Exception as e:
            print('Unexpected exception:', str(e))
            break
    return success_flag
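
# A minimal usage sketch for crawl_data, assuming config.URL_CRAWLED_DATA_DIR
# points at an existing directory and that the site_id below already exists in
# the haystack/site table; the URL and id are illustrative placeholders.
def _demo_crawl_data():
    demo_url = 'https://example.com/product/123'   # hypothetical product page
    demo_site_id = 42                              # hypothetical site_id
    if crawl_data(demo_url, demo_site_id, max_retry_times=3):
        print('raw page saved to',
              config.URL_CRAWLED_DATA_DIR + str(demo_site_id) + '_new')
    else:
        print('crawl failed after retries')
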
def magic_analyzer(site, image_urls):
    """
    analyse url in some magic ways, such as ML
    :param site: dict, {'site_id', 'product_site_id', 'url', 'product_name'}
    :param image_urls: list
    :return: [{'method': 'magic', 'price', 'currency',
               'rule': {'selector', 'weight'}, 'event_time'}] or None
    """
    if not check_site_by_id(site.get('site_id')):
        return None

    res = None  # keep res defined even if the magic analysis raises
    try:
        res = MagicHayStack.get_price(site.get('url'), image_urls)
    except Exception as e:
        LOGGER.error(str(e))

    if not res:
        LOGGER.info('[FAIL] magic: price not found')
        return None

    p = []
    for r in res:
        # r is expected to be a (price, currency, weight) tuple
        if r[0]:
            p.append({
                'method': 'magic',
                'price': r[0],
                'currency': r[1],
                'rule': {
                    'selector': 'magic',
                    'weight': r[2]
                },
                'event_time':
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            })
    if not p:
        LOGGER.info('[FAIL] magic: price not found')
        return None
    LOGGER.info('[SUCCESS] magic: ' + str(p))
    return p
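
# A minimal sketch of calling magic_analyzer, assuming MagicHayStack is
# importable in this module; the site dict and image URLs are placeholders.
def _demo_magic_analyzer():
    demo_site = {
        'site_id': 42,                     # hypothetical
        'product_site_id': 7,              # hypothetical
        'url': 'https://example.com/product/123',
        'product_name': 'Example Widget'
    }
    demo_image_urls = ['https://example.com/img/widget.jpg']
    results = magic_analyzer(demo_site, demo_image_urls)
    if results:
        for r in results:
            print(r['currency'], r['price'], 'weight', r['rule']['weight'])
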
def get_price(site, rule): """ get price according to tag selectors :param site: dict, {'site_id', 'product_site_id', 'url', 'product_name'} :param rule: rule of price_tag_selector.json :return: list of price information, [{'method': 'tag pattern', 'price', 'rule': {'selector', 'weight'}, 'event_time'}, ...] """ error_flag = False price_list = [] if not check_site_by_id(site.get('site_id')): return True, None raw_data_path = config.URL_CRAWLED_DATA_DIR data_file_path = raw_data_path + str(site.get('site_id')) new_data_file_path = raw_data_path + str(site.get('site_id')) + '_new' if not os.path.exists(new_data_file_path) and not os.path.exists( data_file_path): LOGGER.warning('tag pattern: cannot crawl data from this url') return True, None if os.path.exists(new_data_file_path): error_flag, price_list, selector_list = parse_price( new_data_file_path, rule.get('containers'), rule.get('selectors')) else: selector_list = rule.get('selectors') if selector_list and len(selector_list) > 0: if os.path.exists(data_file_path): error_flag, price_list2, selector_list2 = parse_price( data_file_path, rule.get('containers'), selector_list) if price_list2: price_list += price_list2 if price_list and len(price_list) > 0: LOGGER.info('[SUCCESS] tag pattern: ' + str(price_list)) return error_flag, price_list else: LOGGER.info('[FAIL] tag pattern: no tag matched') return error_flag, None
def get_price(site, rule): """ main function to parse an url :param site: dict, {'site_id', 'product_site_id', 'url', 'product_name'} :param rule: rule in url_profiles.json """ if not check_site_by_id(site.get('site_id')): return False, None, None, None raw_data_path = config.URL_CRAWLED_DATA_DIR data_file_path = raw_data_path + str(site.get('site_id')) new_data_file_path = raw_data_path + str(site.get('site_id')) + '_new' if not os.path.exists(new_data_file_path) and not os.path.exists( data_file_path): LOGGER.warning('url pattern: cannot crawl data from this url') return False, None, None, None # parse url ret = None price = None site_type = None is_list_empty = (True if len(rule['list']) == 0 else False) if not is_list_empty: if os.path.exists(new_data_file_path): for l in rule['list']: ret = parse_list(new_data_file_path, l) if ret: site_type = l['type'] if os.path.exists(data_file_path): os.remove(data_file_path) os.rename(new_data_file_path, data_file_path) break if not ret and os.path.exists(data_file_path): os.remove(new_data_file_path) for l in rule['list']: ret = parse_list(data_file_path, l) if ret: break if ret: temp = find_best_match(ret, site.get('product_name')) price = temp.get('price') if not ret or is_list_empty: site_type = rule['item']['type'] if os.path.exists(new_data_file_path): price = parse_single_price(new_data_file_path, rule['item']['price']) if not price: if os.path.exists(data_file_path): os.remove(new_data_file_path) price = parse_single_price(data_file_path, rule['item']['price']) else: if os.path.exists(data_file_path): os.remove(data_file_path) os.rename(new_data_file_path, data_file_path) elif os.path.exists(data_file_path): price = parse_single_price(data_file_path, rule['item']['price']) if not price: return False, None, None, None else: site_country = rule['country'] update_site(site.get('site_id'), site_type, site_country) return (True, price, rule['currency'], datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
def microdata_filter(site):
    """
    filter site contains microdata following the schema in 'http://schema.org/Product'
    :param site: dict, {'site_id', 'product_site_id', 'url', 'product_name'}
    :return: (success_flag, price, currency, event_time)
    """
    success_flag = False
    if not check_site_by_id(site.get('site_id')):
        return success_flag, None, None, None

    products = []
    schema_product_type = 'http://schema.org/Product'
    data_file_path = config.URL_CRAWLED_DATA_DIR + str(site.get('site_id'))
    new_data_file_path = config.URL_CRAWLED_DATA_DIR + str(site.get('site_id')) + '_new'
    if not os.path.exists(new_data_file_path) and not os.path.exists(data_file_path):
        LOGGER.warning('microdata: cannot crawl data from this url')
        return False, None, None, None

    items = None
    if os.path.exists(new_data_file_path):
        with open(new_data_file_path, 'rb') as f:
            encoding = chardet.detect(f.read())['encoding']
            f.seek(0)  # rewind after chardet consumed the stream
            items = microdata.get_items(f, encoding)
    if not items:
        # Fresh crawl produced nothing usable; fall back to the old file
        if os.path.exists(new_data_file_path) and os.path.exists(data_file_path):
            os.remove(new_data_file_path)
        if os.path.exists(data_file_path):
            with open(data_file_path, 'rb') as f:
                encoding = chardet.detect(f.read())['encoding']
                f.seek(0)  # rewind after chardet consumed the stream
                items = microdata.get_items(f, encoding)
    else:
        # Keep the fresh crawl as the canonical data file
        if os.path.exists(data_file_path):
            os.remove(data_file_path)
        os.rename(new_data_file_path, data_file_path)

    for item in items:
        item = json.loads(item.json())
        if item.get('type')[0] == schema_product_type and item.get('properties').get('offers'):
            success_flag = True
            product_name = None
            product_price = None
            product_currency = None
            try:
                product_name = item.get('properties').get('name')[0]
            except Exception as e:
                LOGGER.warning('microdata: ' + str(e))
            try:
                product_price = item.get('properties').get('offers')[0].get(
                    'properties').get('price')[0]
            except Exception as e:
                LOGGER.warning('microdata: ' + str(e))
            try:
                product_currency = item.get('properties').get('offers')[0].get(
                    'properties').get('priceCurrency')[0]
            except Exception as e:
                LOGGER.warning('microdata: ' + str(e))
            if product_price:
                product = {
                    'name': product_name,
                    'price': price_formatter(product_price)[0] if product_price else None,
                    'currency': product_currency
                }
                products.append(product)

    if len(products) == 0:
        LOGGER.info('[FAIL] microdata: not found')
        return success_flag, None, None, None
    elif len(products) == 1:
        product = products[0]
    else:
        product = find_best_match(products, site.get('product_name'))
    LOGGER.info('[RESULT] microdata: ' + str(product.get('currency')) + ' ' +
                str(product.get('price')))
    return success_flag, product.get('price'), product.get('currency'), \
        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
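
# A hedged end-to-end sketch tying the pieces together: crawl the page, then
# try the microdata extractor. The site dict is a placeholder, and the other
# extractors (url pattern, tag pattern, magic) would be chained the same way.
def _demo_pipeline():
    demo_site = {'site_id': 42, 'product_site_id': 7,
                 'url': 'https://example.com/product/123',
                 'product_name': 'Example Widget'}
    if not crawl_data(demo_site['url'], demo_site['site_id'], max_retry_times=3):
        print('crawl failed')
        return
    ok, price, currency, event_time = microdata_filter(demo_site)
    if ok and price:
        print('microdata:', currency, price, 'at', event_time)
    else:
        print('microdata found nothing; fall back to url/tag/magic extractors')
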