def execute_sql(sql):
    """Execute a single write statement against the service-platform DB and commit.

    :param sql: complete SQL statement (no parameters are bound; the empty
                list is passed so the driver still treats it as parameterized).

    Fix over the original: the cursor and pooled connection were leaked if
    ``execute``/``commit`` raised; cleanup now runs unconditionally.
    """
    service_platform_conn = service_platform_pool.connection()
    try:
        cursor = service_platform_conn.cursor()
        try:
            cursor.execute(sql, [])
            service_platform_conn.commit()
        finally:
            cursor.close()
    finally:
        # Return the connection to the pool even when the statement fails.
        service_platform_conn.close()
def _execute(self, **kwargs):
    """Crawl a GT (group tour) list page, archive the raw result to Mongo,
    then bulk-insert the parsed rows into MySQL.

    Returns ``(len(result), error_code)``.
    Raises ServiceStandardError on MySQL failure or when nothing was scraped.
    """
    dept_info = self.task.kwargs['dept_info']
    dest_info = self.task.kwargs['dest_info']
    source = self.task.kwargs['source']
    # need_cache only on the first attempt (used_times == 0).
    error_code, result, page_store_key = GT_to_database(
        tid=self.task.task_id,
        used_times=self.task.used_times,
        vacation_type=self.task.kwargs['vacation_type'],
        source=source,
        ticket=self.task.kwargs,
        need_cache=self.task.used_times == 0)
    # Archive the raw crawl outcome per source, e.g. 'ctripGT_list'.
    # NOTE(review): used_times is compared to 0 above but indexed [0] here —
    # presumably a sequence whose first element is the retry count; confirm.
    db[source + 'GT_list'].save({
        'collections': self.task.collection,
        'task_id': self.task.task_id,
        'used_times': self.task.used_times[0],
        'stored_page_keys': page_store_key,
        'dept_info': dept_info,
        'dest_info': dest_info,
        'result': result,
        'source': self.task.kwargs['source'],
        'insert_time': datetime.datetime.now()
    })
    self.task.error_code = error_code
    sql = SQL.format(self.task.task_name)
    data = []
    # The two sources expose the product id under different keys.
    if source == 'ctrip':
        for res in result:
            data.append(
                (source, res['pid_3rd'], dept_info['id'], dest_info['id'],
                 json.dumps(res)))
    elif source == 'tuniu':
        for res in result:
            data.append(
                (source, res['id'], dept_info['id'], dest_info['id'],
                 json.dumps(res)))
    try:
        service_platform_conn = service_platform_pool.connection()
        cursor = service_platform_conn.cursor()
        _res = cursor.executemany(sql, data)
        service_platform_conn.commit()
        cursor.close()
        service_platform_conn.close()
        self.task.get_data_per_times = len(data)
        self.task.list_task_insert_db_count = _res
    except Exception as e:
        raise ServiceStandardError(
            error_code=ServiceStandardError.MYSQL_ERROR,
            wrapped_exception=e)
    # error_code 27 is treated as a tolerated "success with caveat";
    # otherwise an empty batch means the crawl produced nothing usable.
    if error_code == 27 or len(data) > 0:
        self.task.error_code = 0
    else:
        raise ServiceStandardError(
            error_code=ServiceStandardError.EMPTY_TICKET)
    return len(result), error_code
def insert(sql, data):
    """Bulk-execute *sql* with the row sequence *data* and commit.

    :param sql: parameterized statement suitable for ``executemany``.
    :param data: iterable of parameter tuples.
    :return: the driver's ``executemany`` result (affected row count).

    Fix over the original: cursor/connection are now released in ``finally``
    so a failing statement no longer leaks a pooled connection.
    """
    # TODO: connection-pool handling still unspecified (original author note).
    service_platform_conn = service_platform_pool.connection()
    try:
        cursor = service_platform_conn.cursor()
        try:
            _res = cursor.executemany(sql, data)
            service_platform_conn.commit()
        finally:
            cursor.close()
    finally:
        service_platform_conn.close()
    return _res
def hilton_recover():
    """Re-open finished Hilton detail tasks: for every row in the snapshot
    table, mark matching Mongo task documents as finished.

    Fix over the original: the cursor and pooled connection were never
    closed; the result set is now fetched and the DB handles released
    before the (potentially slow) Mongo updates run.
    """
    sql = '''select * from detail_hotel_hilton_20180420a'''
    conn_s = service_platform_pool.connection()
    try:
        cursor = conn_s.cursor()
        try:
            cursor.execute(sql)
            query_result = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn_s.close()
    for i in query_result:
        # i[4] is the source_id column — position assumed from the original;
        # TODO confirm against the table schema.
        print(i[4])
        collections.update_many({'args.source_id': i[4]},
                                {'$set': {'finished': 1}}, False, True)
def hotel_list_insert_daodao(table_name, res_data):
    """Upsert daodao hotel-list rows, merging each row's source_list JSON
    with what is already stored for the same localtion_id.

    res_data rows are lists where index 6 is localtion_id and index 8 is the
    source_list dict; rows matched against an existing record get the DB id
    prepended (making them 10-wide) and go through ``replace``, unmatched
    rows stay 9-wide and go through ``insert``.

    NOTE(review): ``source_id`` below is not defined anywhere in this
    function — it must come from an enclosing scope or is a bug; confirm.
    NOTE(review): ``self.logger`` in the except clause will NameError in a
    plain module-level function; looks copy-pasted from a method — confirm.
    """
    try:
        service_platform_conn = service_platform_pool.connection()
        cursor = service_platform_conn.cursor()
        sel_sql = 'select id, localtion_id, source_list from {} where source_id = %s'.format(table_name)
        rep_sql = "replace into {} (id, name, name_en, city_id, country_id, `from`, source_id, localtion_id, status, source_list) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)".format(
            table_name)
        ins_sql = "insert into {} (name, name_en, city_id, country_id, `from`, source_id, localtion_id, status, source_list) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)".format(
            table_name)
        cursor.execute(sel_sql, source_id)
        # Map localtion_id -> (row id, parsed source_list) for existing rows.
        exists_localtion = {}
        for id, localtion_id, source_list in cursor.fetchall():
            print id, localtion_id, source_list
            exists_localtion[localtion_id] = (id, json.loads(source_list or "{}"))
        new_hotels_and_not_id = []
        if exists_localtion:
            for _i, line in enumerate(res_data):
                new_hotels = {}
                mylocaltion_id, myhotels = line[6], line[8]
                if exists_localtion.has_key(mylocaltion_id):
                    # Merge the two source_list dicts, iterating over the
                    # larger key set and preferring its non-falsy values.
                    yourid, yourhotels = exists_localtion[mylocaltion_id]
                    mykeys = myhotels.keys()
                    yourkeys = yourhotels.keys()
                    if len(mykeys) > len(yourkeys):
                        for k in mykeys:
                            new_hotels[k] = myhotels.get(k) or yourhotels.get(k)
                    else:
                        for k in yourkeys:
                            new_hotels[k] = yourhotels.get(k) or myhotels.get(k)
                    line[8] = json.dumps(new_hotels)
                    # Prepend the existing DB id so the row fits rep_sql.
                    line.insert(0, yourid)
                else:
                    line[8] = json.dumps(line[8] or {})
                    new_hotels_and_not_id.append(line)
            # Only the id-prepended (10-wide) rows are valid for replace.
            res_data = filter(lambda x:len(x)==10, res_data)
            cursor.executemany(rep_sql, res_data)
            service_platform_conn.commit()
            if new_hotels_and_not_id:
                cursor.executemany(ins_sql, new_hotels_and_not_id)
                service_platform_conn.commit()
        if not exists_localtion:
            # No existing rows at all: plain insert of every row.
            for line in res_data:
                line[8] = json.dumps(line[8])
            cursor.executemany(ins_sql, res_data)
            service_platform_conn.commit()
        cursor.close()
        service_platform_conn.close()
    except Exception as e:
        self.logger.exception(msg="[mysql error]", exc_info=e)
        raise ServiceStandardError(error_code=ServiceStandardError.MYSQL_ERROR,
                                   wrapped_exception=e)
def insert_error_f_md5_file(file_name, error_md5, file_md5, img_type):
    """Record a bad-image md5 pair, routed by image type.

    :param file_name: name of the offending file.
    :param error_md5: md5 flagged as erroneous.
    :param file_md5:  md5 of the file itself.
    :param img_type:  'hotel' writes to error_f_md5_file, anything else to
                      error_f_md5_file_poi.

    Fix over the original: cursor/connection are released in ``finally`` so
    an insert failure no longer leaks a pooled connection.
    """
    if img_type == 'hotel':
        insert_sql = '''INSERT IGNORE INTO error_f_md5_file (file_name, file_md5, error_md5) VALUES (%s, %s, %s);'''
    else:
        insert_sql = '''INSERT IGNORE INTO error_f_md5_file_poi (file_name, file_md5, error_md5) VALUES (%s, %s, %s);'''
    conn = service_platform_pool.connection()
    try:
        cursor = conn.cursor()
        try:
            # INSERT IGNORE: duplicates are silently skipped by MySQL.
            cursor.execute(insert_sql, (file_name, file_md5, error_md5))
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
def _execute(self, **kwargs):
    """Crawl a qyer city hotel-list page, archive the crawl to Mongo, and
    bulk-insert (source, sid, city, country, url) rows into MySQL.

    Returns ``(result, error_code)``.
    Raises ServiceStandardError on MySQL failure or when no rows were found.
    """
    city_id = self.task.kwargs['city_id']
    country_id = self.task.kwargs['country_id']
    check_in = self.task.kwargs['check_in']
    city_url = self.task.kwargs['city_url']
    # need_cache only on the first attempt (used_times == 0).
    error_code, result, page_store_key, types_result_num = qyer_list_to_database(
        tid=self.task.task_id,
        used_times=self.task.used_times,
        source=self.task.kwargs['source'],
        city_id=city_id,
        check_in=check_in,
        city_url=city_url,
        need_cache=self.task.used_times == 0)
    # Archive the raw crawl outcome.
    # NOTE(review): used_times is compared to 0 above but indexed [0] here —
    # presumably a sequence whose first element is the retry count; confirm.
    collections.save({
        'collections': self.task.collection,
        'task_id': self.task.task_id,
        'used_times': self.task.used_times[0],
        'total_num': types_result_num,
        'stored_page_keys': page_store_key,
        'result': result,
        'insert_time': datetime.datetime.now()
    })
    self.task.error_code = error_code
    sql = SQL.format(self.task.task_name)
    data = []
    # Only sid and url of each result tuple are persisted.
    for sid, url, page_id, branch, tag in result:
        data.append(('qyer', sid, city_id, country_id, url))
    try:
        service_platform_conn = service_platform_pool.connection()
        cursor = service_platform_conn.cursor()
        _res = cursor.executemany(sql, data)
        service_platform_conn.commit()
        cursor.close()
        service_platform_conn.close()
        self.task.get_data_per_times = len(data)
        self.task.list_task_insert_db_count = _res
    except Exception as e:
        raise ServiceStandardError(
            error_code=ServiceStandardError.MYSQL_ERROR,
            wrapped_exception=e)
    # An empty batch means the crawl produced nothing usable.
    if len(data) > 0:
        self.task.error_code = 0
    else:
        raise ServiceStandardError(
            error_code=ServiceStandardError.EMPTY_TICKET)
    return result, error_code
def get_seek(task_name):
    """Look up the crawl seek position for *task_name*.

    :return: ``(seek, priority, sequence)`` from task_seek, or
             ``(default timestamp, PRIORITY, 0)`` when no row exists.

    Fixes over the original: the task name is now bound as a query
    parameter instead of being %-interpolated into the SQL string
    (injection-prone), cleanup runs in ``finally``, and the redundant
    ``len(result) == 0`` check is gone (``not result`` covers it).
    """
    sql = """select seek, priority, sequence from task_seek where task_name = %s"""
    conn = service_platform_pool.connection()
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, (task_name,))
            result = cursor.fetchone()
        finally:
            cursor.close()
    finally:
        conn.close()
    logger.info('timestramp, priority : %s ' % str(result))
    if not result:
        return get_default_timestramp(), PRIORITY, 0
    return result
def hotel_list_insert_db(table_name, res_data):
    """Replace-insert hotel list rows into *table_name*.

    :param table_name: target MySQL table (interpolated into the statement).
    :param res_data: iterable of 7-tuples matching the column list below.
    :raises ServiceStandardError: MYSQL_ERROR wrapping any DB failure.

    NOTE(review): ``self.logger`` in the except clause will NameError in a
    plain module-level function, masking the original DB error — this looks
    copy-pasted from a method; confirm and fix.
    """
    try:
        service_platform_conn = service_platform_pool.connection()
        cursor = service_platform_conn.cursor()
        # `from` is a reserved word, hence the backticks.
        sql = "replace into {} (name, name_en, city_id, country_id, `from`, status, source_list) VALUES (%s,%s,%s,%s,%s,%s,%s)".format(
            table_name)
        cursor.executemany(sql, res_data)
        service_platform_conn.commit()
        cursor.close()
        service_platform_conn.close()
    except Exception as e:
        self.logger.exception(msg="[mysql error]", exc_info=e)
        raise ServiceStandardError(error_code=ServiceStandardError.MYSQL_ERROR,
                                   wrapped_exception=e)
def execute_sql(sql, commit=False):
    """Execute *sql*; either commit (write mode) or fetch all rows (read mode).

    :param sql: complete SQL statement.
    :param commit: when True, commit and return None; when False, return
                   ``cursor.fetchall()``.

    Fix over the original: cursor/connection are released in ``finally`` so
    a failing statement no longer leaks a pooled connection.
    """
    conn = service_platform_pool.connection()
    try:
        cursor = conn.cursor()
        try:
            logger.info(sql)
            cursor.execute(sql)
            if commit:
                conn.commit()
                return None
            result = cursor.fetchall()
            logger.info('row_count : %s ' % cursor.rowcount)
            return result
        finally:
            cursor.close()
    finally:
        conn.close()
def hotel_list_insert_db():
    """Insert crawled hotel-list rows for the current task.

    NOTE(review): takes no parameters yet reads ``self.task`` and
    ``res_data`` — presumably a closure nested inside a task method whose
    enclosing scope supplies both; confirm against the full file.

    :raises ServiceStandardError: MYSQL_ERROR wrapping any DB failure.
    """
    try:
        service_platform_conn = service_platform_pool.connection()
        cursor = service_platform_conn.cursor()
        sql = "INSERT IGNORE INTO {} (source, source_id, city_id, country_id, hotel_url) VALUES (%s,%s,%s,%s,%s)".format(
            self.task.task_name)
        # INSERT IGNORE: duplicate keys are skipped, _res is affected rows.
        _res = cursor.executemany(sql, res_data)
        service_platform_conn.commit()
        cursor.close()
        service_platform_conn.close()
        # Bookkeeping for the task framework.
        self.task.list_task_insert_db_count = _res
        self.task.get_data_per_times = len(res_data)
    except Exception as e:
        self.logger.exception(msg="[mysql error]", exc_info=e)
        raise ServiceStandardError(
            error_code=ServiceStandardError.MYSQL_ERROR,
            wrapped_exception=e)
def create_table(table_name):
    """Create a crawl table whose DDL template is chosen from the table-name
    pattern: ``detail_<kind>_...``, ``list_...``, ``images_hotel_...`` or
    ``images_<x>_{daodao|qyer}_...``.

    :param table_name: underscore-separated table name, interpolated into
                       the selected DDL template.
    :raises ValueError: when the name matches no known pattern (the
            original code hit an UnboundLocalError on ``sql_file`` instead).

    Fix over the original: explicit error for unknown patterns, and
    cursor/connection released in ``finally``.
    """
    tab_args = table_name.split('_')
    sql_file = None
    if tab_args[0] == 'detail':
        sql_file = LOAD_FILES[tab_args[1]]
    elif tab_args[0] == 'list':
        sql_file = LOAD_FILES['list']
    elif tab_args[0] == 'images':
        if tab_args[1] == 'hotel':
            sql_file = LOAD_FILES['images_hotel']
        elif tab_args[2] in ('daodao', 'qyer'):
            sql_file = LOAD_FILES['images_daodao']
    if sql_file is None:
        raise ValueError('unknown table name pattern: %s' % table_name)
    conn = service_platform_pool.connection()
    try:
        cursor = conn.cursor()
        try:
            # DDL — MySQL autocommits CREATE TABLE, no commit needed.
            cursor.execute(sql_file % table_name)
            logger.info('已创建表: %s' % table_name)
        finally:
            cursor.close()
    finally:
        conn.close()
def insert_all_data():
    """Flush the module-level batches: mark picture-detect tasks done in the
    task DB and insert the corresponding rows into the POI-data DB.

    Reads module globals ``update_data_list`` (task ids to mark status=2),
    ``task_data_list`` (rows for PoiPictureInformation) and ``is_new_task``
    (selects which task table to update).
    """
    global update_data_list
    global task_data_list
    global is_new_task
    # init insert conn and cursor — one connection per pool/database.
    task_conn = service_platform_pool.connection()
    task_cursor = task_conn.cursor()
    data_conn = spider_data_poi_pool.connection()
    data_cursor = data_conn.cursor()
    # update task table and insert data table
    if is_new_task:
        _res = task_cursor.executemany(
            '''UPDATE pic_detect_task_new SET status=2 WHERE id=%s;''',
            update_data_list)
    else:
        _res = task_cursor.executemany(
            '''UPDATE pic_detect_task SET status=2 WHERE id=%s;''',
            update_data_list)
    logger.debug("[update status finished][update task: {}]".format(_res))
    _res = data_cursor.executemany(
        '''INSERT IGNORE INTO PoiPictureInformation (city_id, poi_id, pic_name, is_scaned) VALUES (%s, %s, %s, 0); ''',
        task_data_list)
    logger.debug(
        "[update status finished][insert poi face detect task: {}]".format(
            _res))
    # Deliberate transaction ordering (translated from the original note):
    # commit only after BOTH statements have succeeded; if either raises,
    # neither connection is committed.
    task_conn.commit()
    task_cursor.close()
    task_conn.close()
    data_conn.commit()
    data_cursor.close()
    data_conn.close()
def _execute(self, **kwargs):
    """Crawl one hotel-detail page (source-specific strategies), parse it,
    archive the parsed document to Mongo, then insert it into MySQL.

    Reads url/source/source_id/city_id/country_id/hid from self.task.kwargs.
    Returns ``self.task.error_code`` (0 on success).
    Raises ServiceStandardError for crawl, Mongo or MySQL failures.
    """
    url = self.task.kwargs['url']
    source = self.task.kwargs['source']
    source_id = self.task.kwargs['source_id']
    city_id = self.task.kwargs['city_id']
    country_id = self.task.kwargs['country_id']
    hid = self.task.kwargs['hid']
    headers = {}
    other_info = {'source_id': source_id, 'city_id': city_id}
    # These chains go through a dedicated cached fetch + parser.
    if source in ['starwood', 'hyatt', 'gha', 'shangrila', 'fourseasons']:
        error_code, res, page_store_key_list = hotel_detail_database(
            url, source)
        if error_code == 0:
            result = parse_hotel_info(res)
        else:
            raise ServiceStandardError(error_code=error_code)
    else:
        with MySession(need_cache=True) as session:
            # booking start
            if source == 'booking':
                headers['Referer'] = 'http://www.booking.com'
            # booking end
            session.headers.update(headers)
            start = time.time()
            # Default path: single page fetch through the cached session.
            if source not in ('hilton', 'ihg', 'holiday', 'accor',
                              'marriott'):
                page = session.get(url, timeout=240)
                page.encoding = 'utf8'
                content = page.text
            elif source == 'ihg':
                # Two URLs joined by '#####'; parser receives both pages.
                url1, url2 = url.split('#####')
                page1 = session.get(url1, timeout=240)
                page1.encoding = 'utf8'
                content1 = page1.text
                page2 = session.get(url2, timeout=240)
                page2.encoding = 'utf8'
                content2 = page2.text
                content = [content1, content2]
            elif source == 'holiday':
                # Note the swapped order: url2 comes first in the ticket.
                url2, url1 = url.split('#####')
                page1 = requests.get(
                    url1,
                    headers={
                        'x-ihg-api-key': 'se9ym5iAzaW8pxfBjkmgbuGjJcr3Pj6Y',
                        'ihg-language': 'zh-CN'
                    },
                    timeout=240)
                page1.encoding = 'utf8'
                content1 = page1.text
                page2 = requests.get(
                    url2,
                    timeout=240,
                    headers={
                        'accept': 'application/json, text/plain, */*',
                        'Content-Type': 'application/json; charset=UTF-8',
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                        'ihg-language': 'zh-CN',
                    })
                page2.encoding = 'utf8'
                content2 = page2.text
                # Same endpoint as page1 but without the language header
                # (presumably to get the English variant — confirm).
                page3 = requests.get(url1, headers={
                    'x-ihg-api-key': 'se9ym5iAzaW8pxfBjkmgbuGjJcr3Pj6Y'
                }, timeout=240)
                page3.encoding = 'utf8'
                content3 = page3.text
                content = (content1, content2, content3)
            elif source == 'accor':
                # Fetch a SOCKS5 proxy address from the internal proxy
                # service, then crawl through it (TLS verification off).
                proxy_url = "http://10.10.239.46:8087/proxy?source=pricelineFlight&user=crawler&passwd=spidermiaoji2014"
                r = requests.get(proxy_url)
                proxies = {'https': "socks5://" + str(r.text)}
                headers = {
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
                }
                page = requests.get(url, headers=headers, verify=False,
                                    proxies=proxies)
                page.encoding = 'utf8'
                content = page.text
            elif source == 'marriott':
                # Ticket packs extra fields after '#####':
                # key=value pairs (longtitude/latitude) or, at position 1,
                # the English hotel name.
                url_list = url.split('#####')
                url = url_list[0]
                for i in url_list:
                    if len(i.split('=')) > 1:
                        key, value = i.split('=')[0], i.split('=')[1]
                        if key == 'longtitude':
                            other_info['longtitude'] = value
                        if key == 'latitude':
                            other_info['latitude'] = value
                    else:
                        if url_list.index(i) == 1:
                            other_info['hotel_name_en'] = i
                # Derived sub-pages: photos, map, fact sheet.
                url2 = url.replace("travel", "hotel-photos")
                url3 = url.replace("travel/", "maps/travel/")
                url4 = url.replace("hotels/", "hotels/fact-sheet/")
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:47.0) Gecko/20100101 Firefox/47.0'
                }
                if "https://www.marriott.com" in url:
                    page1 = requests.get(url, headers=headers, timeout=240)
                    page2 = requests.get(url2, headers=headers, timeout=240)
                    page3 = requests.get(url3, headers=headers, timeout=240)
                    page4 = requests.get(url4, headers=headers, timeout=240)
                    page1.encoding = 'utf8'
                    page2.encoding = 'utf8'
                    page3.encoding = 'utf8'
                    page4.encoding = 'utf8'
                    content1 = page1.text
                    content2 = page2.text
                    content3 = page3.text
                    content4 = page4.text
                    content = (content1, content2, content3, content4)
                else:
                    # Non-www.marriott.com domains: overview page only.
                    url2 = url + "/hotel-overview"
                    page1 = requests.get(url, headers=headers, timeout=240)
                    page2 = requests.get(url2, headers=headers, timeout=240)
                    page1.encoding = 'utf8'
                    page2.encoding = 'utf8'
                    content1 = page1.text
                    content2 = page2.text
                    content = (content1, content2)
            else:
                # Remaining branch: hilton. Fetch the landing page plus
                # detail popup, map and description sub-pages.
                session.auto_update_host = False
                hilton_index = url.find('index.html')
                if hilton_index > -1:
                    url = url[:hilton_index]
                split_args = url.split('/')
                detail_url = 'http://www3.hilton.com/zh_CN/hotels/{0}/{1}/popup/hotelDetails.html'.format(
                    split_args[-3], split_args[-2])
                map_info_url = url + 'maps-directions.html'
                desc_url = url + 'about.html'
                page = session.get(url)
                map_info_page = session.get(map_info_url)
                desc_page = session.get(desc_url)
                detail_page = session.get(detail_url, )
                page.encoding = 'utf8'
                detail_page.encoding = 'utf8'
                map_info_page.encoding = 'utf8'
                desc_page.encoding = 'utf8'
                __content = page.text
                logger.info(detail_url)
                __detail_content = detail_page.text
                __map_info_content = map_info_page.text
                __desc_content = desc_page.text
                content = [
                    __content, __detail_content, __map_info_content,
                    __desc_content
                ]
        logger.debug("[crawl_data][Takes: {}]".format(time.time() - start))
        start = time.time()
        result = parse_hotel(content=content,
                             url=url,
                             other_info=other_info,
                             source=source,
                             part=self.task.task_name,
                             retry_count=self.task.used_times)
        # func_name: Python 2 attribute (would be __name__ on Python 3).
        logger.debug("[parse_hotel][func: {}][Takes: {}]".format(
            parse_hotel.func_name, time.time() - start))
    # Archive the parsed document to Mongo with a geo index; duplicates
    # are tolerated (the unique (source, source_id) index rejects them).
    try:
        data_collections = mongo_data_client['ServicePlatform'][
            self.task.task_name]
        data_collections.create_index([('source', 1), ('source_id', 1)],
                                      unique=True,
                                      background=True)
        data_collections.create_index([('location', '2dsphere')],
                                      background=True)
        tmp_result = deepcopy(result.values(backdict=True))
        lon, lat = str(result.map_info).split(',')
        lon, lat = float(lon), float(lat)
        tmp_result.update(
            {'location': {
                'type': "Point",
                'coordinates': [lon, lat]
            }})
        data_collections.save(tmp_result)
    except pymongo.errors.DuplicateKeyError:
        # logger.exception("[result already in db]", exc_info=e)
        logger.warning("[result already in db]")
    except Exception as exc:
        raise ServiceStandardError(
            error_code=ServiceStandardError.MONGO_ERROR,
            wrapped_exception=exc)
    start = time.time()
    # MySQL insert: stash hid inside the others_info JSON first.
    try:
        service_platform_conn = service_platform_pool.connection()
        cursor = service_platform_conn.cursor()
        others_info = json.loads(result.others_info)
        others_info['hid'] = hid
        result.others_info = json.dumps(others_info)
        sql = result.generation_sql()
        sql = sql.format(table_name=self.task.task_name)
        values = result.values()
        self.logger.info(result.__dict__)
        cursor.execute(sql, values)
        service_platform_conn.commit()
        cursor.close()
        service_platform_conn.close()
    except Exception as e:
        logger.exception(e)
        raise ServiceStandardError(
            error_code=ServiceStandardError.MYSQL_ERROR,
            wrapped_exception=e)
    logger.debug("[Insert DB][Takes: {}]".format(time.time() - start))
    self.task.error_code = 0
    return self.task.error_code
# h.is_parking_free = '0' # h.service = '只有床' # h.img_items = "https://www.'baidu.com|https:'//www.baidu.com" # h.description = '0.5星级酒店' # h.accepted_cards = 'AAA' # h.check_in_time = '2017-09-09 12:23:45' # h.check_out_time = '2017-09-09 18:23:45' # h.hotel_url = 'https://map.baidu.com' h.update_time = datetime.datetime.now() # h.continent = 'NULL' h.other_info = json.dumps({'json': '个性不'}) print h sql = h.generation_sql() values = h.values() print values sql = sql.format(table_name='aaaa') # sql = sql % values print sql from proj.mysql_pool import service_platform_pool service_platform_conn = service_platform_pool.connection() cursor = service_platform_conn.cursor() cursor.execute(sql, values) cursor.close() service_platform_conn.close() # c = Coordinate() # c.latitude = '123.12343' # c.longitude = '2345.4123' # print c
def _execute(self, **kwargs):
    """Resolve a daodao (TripAdvisor CN) redirect into the real OTA hotel
    URL for one source, then write it into the row's source_list JSON.

    Strategy: first try to derive the real URL purely from the redirect
    URL's query string; on any failure, fetch the redirect target and
    scrape the URL out of the page. Returns (id, source, error_code).
    """
    id = kwargs['id']
    source = kwargs['source']
    url = kwargs['url']
    table_name = kwargs['table_name']
    t1 = time.time()
    with MySession(need_cache=False, need_proxies=True) as session:
        resp_daodao = session.get(url)
        content_daodao = resp_daodao.text
        # The daodao page embeds the target in redirectPage('...').
        real_daodao_url = re.findall(r"redirectPage\('(.*)'\)",
                                     content_daodao)[0]
        # print content
        real_url = None
        print id, source, url
        # self.logger.info('%s\n%s\n%s\n' % (source, url, content[-600:]))
        # Fast path: rebuild the OTA URL from query parameters only.
        # Bare except is deliberate here — ANY failure (including the
        # explicit raises for expedia/unknown sources) falls through to
        # the scrape path below.
        try:
            if source == 'agoda':
                url_obj = urlparse.urlsplit(real_daodao_url)
                hid = urlparse.parse_qs(url_obj.query)['hid'][0]
                line = SourceIDS.find_one({'source_id': 'agoda'+hid})
                assert line, '没找到agoda的hid'
                real_url = line['url']
            elif source == 'booking':
                real_url = real_daodao_url.replace('http', 'https').replace(
                    '.html', '.zh-cn.html').split('?', 1)[0]
            elif source == 'ctrip':
                base_url = 'http://hotels.ctrip.com/international/{0}.html'
                url_obj = urlparse.urlsplit(real_daodao_url)
                try:
                    hotel_id = urlparse.parse_qs(url_obj.query)['hotelid'][0]
                except:
                    # Some redirects nest the target inside jumpUrl.
                    jumpUrl = urlparse.parse_qs(url_obj.query)['jumpUrl'][0]
                    hotel_id = re.findall('/name(\w+)',
                                          urlparse.urlsplit(jumpUrl).path)[0]
                real_url = base_url.format(hotel_id)
            elif source == 'expedia':
                raise Exception('我是expedia')
            elif source == 'hotels':
                base_url = 'https://ssl.hotels.cn/ho{0}/'
                url_obj = urlparse.urlsplit(real_daodao_url)
                hotel_id = urlparse.parse_qs(url_obj.query)['hotelid'][0]
                real_url = base_url.format(hotel_id)
            else:
                raise Exception('我是别的源')
        except:
            # Slow path: fetch the redirect target and scrape the URL.
            resp = session.get(real_daodao_url)
            content = resp.text
            if source == 'agoda':
                agoda_json = re.search(r'window.searchBoxReact = (.*)(?=;)',
                                       content).group(1)
                agoda_json = json.loads(agoda_json)
                url = agoda_json.get('recentSearches',
                                     [])[0].get('data', {}).get('url')
                # Normalize whatever path form agoda returned.
                if url.startswith('/zh-cn'):
                    base_url = 'https://www.agoda.com'
                elif url.startswith('zh-cn'):
                    base_url = 'https://www.agoda.com/'
                else:
                    base_url = 'https://www.agoda.com/zh-cn'
                real_url = ''.join([base_url, url])
            elif source == 'booking':
                base_url = 'https://www.booking.com'
                root = html.fromstring(content)
                real_url = root.xpath('//div[@id="hotellist_inner"]//a')[0]
                real_url = ''.join([
                    base_url,
                    real_url.attrib.get('href').replace('-cn', '-com')
                ])
            elif source == 'ctrip':
                ctrip_json = re.search(r'hotelPositionJSON: (.*)(?=,)',
                                       content).group(1)
                ctrip_json = json.loads(ctrip_json)
                url = ctrip_json[0].get('url')
                base_url = "http://hotels.ctrip.com"
                real_url = ''.join([base_url, url])
            elif source == 'elong':
                # hotel_id = re.search(r'hotelId":"([0-9]+)"', content).group(1)
                hotel_id = re.search(r'data-hotelid="(\w+)"',
                                     content).group(1)
                real_url = 'http://ihotel.elong.com/{0}/'.format(hotel_id)
            elif source == 'expedia':
                # Still unsupported: bubble out of the task.
                raise Exception('我是expedia')
            elif source == 'hotels':
                root = html.fromstring(content)
                hotel_id = root.xpath(
                    '//div[@id="listings"]//li')[0].attrib.get(
                        'data-hotel-id')
                real_url = "https://ssl.hotels.cn/ho{0}/?pa=1&q-check-out=2018-04-16&tab=description&q-room-0-adults=2&YGF=7&q-check-in=2018-04-15&MGT=1&WOE=1&WOD=7&ZSX=0&SYE=3&q-room-0-children=0".format(
                    hotel_id)
    print real_url
    t2 = time.time()
    self.logger.info('抓取耗时: {}'.format(t2 - t1))
    # Persist: replace the $.{source} entry of the row's source_list JSON.
    service_platform_conn = service_platform_pool.connection()
    cursor = service_platform_conn.cursor()
    sql = "update {0} set source_list = JSON_REPLACE(source_list, '$.{1}', %s) where id = %s".format(table_name, source);
    cursor.execute(sql, (real_url, id))
    service_platform_conn.commit()
    cursor.close()
    service_platform_conn.close()
    t3 = time.time()
    self.logger.info('入库耗时: {}'.format(t3 - t2))
    self.task.error_code = 0
    return id, source, self.task.error_code