import json
import xml.etree.ElementTree as ET

import requests
from redis import RedisError
from sqlalchemy import exists
from sqlalchemy.exc import IntegrityError

# Project-local names used below (db_connect, get_session, ProductIndexModel,
# fixDeco, redis_create_pool, redis_connect, rd_miliashopSpider,
# miliashopSpider, dispatcher, signals) are assumed to be defined or imported
# elsewhere in this module.


def get_newly_added_index(SITE_MAP):
    """Return sitemap URLs that are not yet stored in the product index table."""
    tree = ET.parse(SITE_MAP)
    root = tree.getroot()
    sitemap_list = []
    for atype in root.findall(
            '{http://www.sitemaps.org/schemas/sitemap/0.9}url'):
        url = atype.find(
            '{http://www.sitemaps.org/schemas/sitemap/0.9}loc').text
        if url.endswith('.html'):
            sitemap_list.append(url)
    engine = db_connect()
    session = get_session(engine)
    try:
        spider_list = [
            ans.product_url
            for ans in session.query(ProductIndexModel.product_url).all()
        ]
        # URLs present in exactly one of the two sources ...
        xor_list = list(set(sitemap_list).symmetric_difference(spider_list))
        # ... narrowed down to those missing from the database.
        ans_list = []
        for url in xor_list:
            if not session.query(exists().where(
                    ProductIndexModel.product_url == url)).scalar():
                ans_list.append(url)
        print('database urls:', len(spider_list),
              'sitemap urls:', len(sitemap_list),
              'results:', len(ans_list))
        return ans_list
    except IntegrityError as e:
        print(e)
    finally:
        session.close()
        engine.dispose()
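# A minimal usage sketch for the helper above; 'sitemap.xml' is a placeholder
# path, not a file from the original project:
#
#     new_urls = get_newly_added_index('sitemap.xml')
#     for url in new_urls:
#         print(url)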
def check_product_detail():
    """Spot-check one product: load and print its stored detail JSON."""
    engine = db_connect()
    session = get_session(engine)
    try:
        product_index = session.query(ProductIndexModel).filter(
            ProductIndexModel.product_name ==
            'Vicino Table Large Molteni & C').first()
        product = json.loads(product_index.product.product_detail_information)
        print(product)
    except IntegrityError as e:
        print(e)
    finally:
        session.close()
        engine.dispose()
def fix_Dot_Dot_Dot():
    """Repair truncated product names ('...') via the site's Doofinder search API."""
    engine = db_connect()
    session = get_session(engine)
    header = {
        'method': 'GET',
        'authority': 'eu1-search.doofinder.com',
        'scheme': 'https',
        'origin': 'https://www.miliashop.com',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/69.0.3497.100 Safari/537.36',
        'accept': '*/*',
        'referer': 'https://www.miliashop.com/en/',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
    }
    url = 'https://eu1-search.doofinder.com/5/search'
    params = {
        'hashid': 'cf518a4fc64ed58ed14863348a5bae18',
        'transformer': 'basic',
        'rpp': '50',
        'query': '',
        'query_counter': '5',
        'page': '1'
    }
    fix_count = 0
    for product_index in session.query(ProductIndexModel).filter(
            ProductIndexModel.product_name.like('%...%')).all():
        print('before: ', product_index.product_name)
        try:
            params['query'] = product_index.product_name
            r = requests.get(url, headers=header, params=params)
            fix_name = r.json()['results'][0].get('title')
            # The API appears to return 'Nimrod Low Chair' when nothing
            # matches, so treat that title as an empty result.
            if fix_name == 'Nimrod Low Chair':
                print('ERROR: No search result!')
                continue
            fix_count += 1
            product_index.product_name = fix_name
            print('after: ', product_index.product_name)
            session.commit()
        except (IntegrityError, AttributeError) as e:
            session.rollback()
            # Only IntegrityError carries .orig; 1062 is MySQL's
            # duplicate-entry error code.
            if isinstance(e, IntegrityError) and e.orig.args[0] == 1062:
                print('Duplicated entry: ', product_index.product_name)
                session.delete(product_index)
                session.commit()
    session.close()
    engine.dispose()
    print('fix ... in database complete, total {} items.'.format(fix_count))
def set_start_urls():
    """Seed the scrapy-redis start_urls list with a single product URL."""
    engine = db_connect()
    session = get_session(engine)
    r_pool = redis_create_pool()
    r = redis_connect(r_pool)
    key = rd_miliashopSpider.redis_key
    url = 'https://www.miliashop.com/en/sofas/14351-swingus-dedon-2-seater-sofa.html'
    try:
        r.lpush(key, url)
        print('successfully inserted {} records into redis start_urls.'.format(
            r.llen(key)))
    except RedisError as e:  # lpush raises redis errors, not IntegrityError
        print(e)
    finally:
        r.connection_pool.disconnect()
        session.close()
        engine.dispose()
def fix_Deco_in_DB():
    """Repair product names where the accented 'Decò' was stored corrupted."""
    fix_count = 0
    engine = db_connect()
    session = get_session(engine)
    for product_index in session.query(ProductIndexModel).filter(
            ProductIndexModel.product_name.like('%Dec;ò%')).all():
        product_index.product_name = fixDeco(product_index.product_name)
        try:
            session.commit()
        except IntegrityError as e:
            session.rollback()
            if e.orig.args[0] == 1062:  # Duplicate entry
                print(product_index.product_name)
                fix_count += 1
                session.delete(product_index)
                session.commit()
    session.close()
    engine.dispose()
    print('fix deco in database complete, total {} items.'.format(fix_count))
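# fixDeco is called above but not shown in this snippet; a minimal sketch,
# assuming the corruption is a stray semicolon inside the accented 'Decò'
# (the exact replacement rule is hypothetical):
def fixDeco(name):
    # Collapse the mangled 'Dec;ò' back to 'Decò'.
    return name.replace('Dec;ò', 'Decò')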
def __init__(self, *a, **kw):
    super(miliashopSpider, self).__init__(*a, **kw)
    # Open one database session for the lifetime of the spider.
    self.engine = db_connect()
    self.session = get_session(self.engine)
def __init__(self):
    self.engine = db_connect()
    self.sess = get_session(self.engine)
def __init__(self):
    # Release database resources when the spider finishes.
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.engine = db_connect()
    self.session = get_session(self.engine)
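# spider_closed is connected above but not shown in this snippet; a minimal
# sketch, assuming its only job is to release the resources opened in
# __init__ (the body is hypothetical):
def spider_closed(self, spider):
    self.session.close()
    self.engine.dispose()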