def pusanUrl(name, url):
    """Build the sub-menu entry for one Busan-departure region page.

    A ``clsSubMenuUrls`` object is created for ``name`` / ``url``, the page is
    fetched through ``savefilegethtml.getHtml`` and every line that carries
    both an ``<li`` and an ``<a`` tag becomes a ``clsDetailRegionUrls`` entry
    appended to ``detailRegionList``.  Lines containing the "all" label are
    dropped, except on the D50 / D60 / D70 region pages where they are kept.

    Returns the populated ``clsSubMenuUrls`` object.
    """
    region = clsSubMenuUrls()
    region.name = name
    region.url = url
    print(region.name.decode('utf-8') + ' : ' + region.url)

    pageLines = savefilegethtml.getHtml(
        region.url,
        'class="container',
        '<!-- end .ot_tab_style1 -->',
        'onlinetourSubPagePusan.txt')

    # The URL never changes inside the loop, so decide once whether this
    # region page keeps its "all" entry.
    keepsAllEntry = (region.url.find('D50') > -1
                     or region.url.find('D60') > -1
                     or region.url.find('D70') > -1)

    for line in pageLines:
        # Only list items that hold a link are menu entries.
        if line.find('<li') < 0 or line.find('<a') < 0:
            continue
        if line.find('전체') > -1 and not keepsAllEntry:
            continue

        entry = clsDetailRegionUrls()
        entry.name = tourUtil.getRemovedHtmlTag(line).strip()
        entry.url = mainUrl + tourUtil.getTagAttr(line, 'a', 'href')
        region.detailRegionList.append(entry)
        print(entry.name.decode('utf-8') + ' : ' + entry.url)

    return region
try: for menu in backpackMenuList: #tit_position2 부산출발 검색조건.. print menu.url productListHtml = savefilegethtml.getHtml(menu.url, '<div id="sub_box2">', 'function btn(ckbtn){', 'productListHtml.txt') for each_line in productListHtml: if each_line.find('<h1 class="bic_h">') > -1: productName = each_line.split('bic_h">')[2].split('<')[0] #productNameSplit = productName.split(' ') #period = productNameSplit[len(productNameSplit)-1].replace('일', '') productNameSplit = re.findall( '[\^0-9]+', tourUtil.getRemovedHtmlTag(each_line)) period = productNameSplit[len(productNameSplit) - 1] #if each_line.find('<span class="goods_text">') > -1: # 여행 설명이 잘 적혀 있긴 하지만.. 내일투어에서는 국가 정보가 있는 걸 보조로 가져가야 할듯.. #tourRoute = each_line.split('px;">')[1].split('<')[0] if each_line.find('<span class="travel_box">') > -1: tourRoute = tourUtil.getRemovedHtmlTag(each_line) if each_line.find("sview('") > -1: productCode = each_line.split("sview('")[1].split("'")[0] code2 = each_line.split("sview('")[1].split("'")[2] detailUrl = 'http://www.naeiltour.co.kr/backpack/program_include_list.asp?good_cd=' + productCode + '&sel_ym=' + targetYear + targetMonth print >> exceptFile, 'DetailUrl : ', detailUrl listUrl = 'http://www.naeiltour.co.kr/backpack/program_include_list.asp?good_cd=' productDetailUrl = 'http://www.naeiltour.co.kr/backpack/show.asp?good_cd='
mainpageHtml = savefilegethtml.getHtml('http://www.onlinetour.co.kr/web/home', '<li id="n_pack">', '<!--}} ot_navi-->', 'onlinetourMainPage.txt') # URL 쑤셔넣는 부분... mainMenuList = list() mainMenuUrls = clsMenuUrls() subMenuUrls = clsSubMenuUrls() detailRegionUrls = clsDetailRegionUrls() chkFree = False chkDomestic = False for menuList in mainpageHtml: try: #print menuList if menuList.find('<a href=') > -1 and menuList.find('<li>') < 0: mainMenuUrls = clsMenuUrls() mainMenuUrls.name = tourUtil.getRemovedHtmlTag(menuList).strip() mainMenuUrls.url = tourUtil.getTagAttr(menuList, 'a', 'href') mainMenuUrls.dmst_div = 'A' if menuList.find('부산출발') > -1: mainMenuUrls.departCity = 'PUS' # 부산의 경우.. 세부 지역 URL이 바로 노출되어 있지 않아.. 강제로 쭈셔 넣어 준다.. mainMenuUrls.subMenuList.append(pusanUrl('동남아', 'http://www.onlinetour.co.kr/web/tour?region_cd=D10')) mainMenuUrls.subMenuList.append(pusanUrl('일본', 'http://www.onlinetour.co.kr/web/tour?region_cd=D20')) mainMenuUrls.subMenuList.append(pusanUrl('중국', 'http://www.onlinetour.co.kr/web/tour?region_cd=D30')) mainMenuUrls.subMenuList.append(pusanUrl('괌/사이판', 'http://www.onlinetour.co.kr/web/tour?region_cd=D40')) mainMenuUrls.subMenuList.append(pusanUrl('남태평양', 'http://www.onlinetour.co.kr/web/tour?region_cd=D50')) mainMenuUrls.subMenuList.append(pusanUrl('유럽/특수', 'http://www.onlinetour.co.kr/web/tour?region_cd=D60')) mainMenuUrls.subMenuList.append(pusanUrl('미주/특수', 'http://www.onlinetour.co.kr/web/tour?region_cd=D70')) else: mainMenuUrls.departCity = 'ICN'
#query = savefilegethtml.getMasterMergeQuery('vgtour', mastercode, '', '', productGroupCls.name, productName, tourType, region, productComment, '') # A : 해외(Abroad) #print query productCls = clsProduct() #productListHtml = open('productListHtml.txt') departConfirm = False for product in productListHtml: #print 'product : ' + product if product.find('pro_date') > -1: productCls = clsProduct() departConfirm = False #<td class="pro_date">07/07 (월) 16:15<br/><span>07/09 (수) 21:05</span></td> #<td class="pro_date">07/07 (월) <br/><span></span></td> #<td class="pro_date">07/28 (월) 09:10<br/><span>08/17 (<span style="color:red;margin-bottom:0;">일</span>) 05:50</span></td> daySplit = tourUtil.getNumArray(tourUtil.getRemovedHtmlTag(product)) productCls.sDay = '' productCls.sTime = '' productCls.aDay = '' productCls.aTime = '' if len(daySplit) > 1: productCls.sDay = '2014' + daySplit[0] + daySplit[1] if len(daySplit) > 3: productCls.sTime = daySplit[2] + daySplit[3] if len(daySplit) > 5: productCls.aDay = '2014' + daySplit[4] + daySplit[5] if len(daySplit) > 7: productCls.aTime = daySplit[6] + daySplit[7] #productCls.sDay = '2014' + product.split('pro_date">')[1].split('(')[0].strip().replace('/', '') #productCls.sTime = product.split('<br/>')[0].split(')')[1].strip().replace(':', '')
if period == '' and tourkind == 'F': if product.find('<td class="FRIDAYSPACING" >' ) > -1 and product.find('.gif') > -1: productCls.airCode = product[product.find('.gif') - 2:product.find('.gif')] #<td class="FRIDAYSPACING" width="220"><B><FONT COLOR="RED">569,000원</FONT>2박3일 도미인 아사쿠사 호텔(12박 13일)</td> #<td class="FRIDAYSPACING" width="220"><B><FONT COLOR="RED">369,000원</FONT> 2박3일<BR>신주쿠 워싱톤 호텔(더블룸)</td> #<td class="FRIDAYSPACING" width="220"><B><FONT COLOR="RED">569,000원</FONT>(2박3일) 도미인 아사쿠사 호텔</td> if product.find('idth="220">') > -1: print product print type(product) splitText = product.split('박'.decode('utf-8')) tmpText = re.findall('[\^0-9]+', tourUtil.getRemovedHtmlTag(splitText[0])) print 'Night : ', tmpText[len(tmpText) - 1] tmpText = re.findall('[\^0-9]+', tourUtil.getRemovedHtmlTag(splitText[1])) print 'day : ', tmpText[0] #if product.find('(') > -1: #productCls.night = re.findall(r"\d", product.split('(')[1])[0] #productCls.period = re.findall(r"\d", product.split('(')[1])[1] #elif product.find('[') > -1: #productCls.night = re.findall(r"\d", product.split('[')[1])[0] #productCls.period = re.findall(r"\d", product.split('[')[1])[1] #print productCls.toString() query = savefilegethtml.getDetailMergeQueryTest1( 'naeiltour', 'productcode', productCls.code, productCls.productname, '20' + productCls.dDay, '', 'ICN',
query = savefilegethtml.getDetailMergeQueryTest1('naeiltour', 'productcode', productCls.code, productCls.productname, '20' + productCls.dDay, '', 'ICN', '', productCls.airCode, productCls.status, productCls.url, productCls.price, '0', '0', '0', '', '') #print query #break if period == '' and tourkind == 'F': if product.find('<td class="FRIDAYSPACING" >') > -1 and product.find('.gif') > -1: productCls.airCode = product[product.find('.gif') - 2:product.find('.gif')] #<td class="FRIDAYSPACING" width="220"><B><FONT COLOR="RED">569,000원</FONT>2박3일 도미인 아사쿠사 호텔(12박 13일)</td> #<td class="FRIDAYSPACING" width="220"><B><FONT COLOR="RED">369,000원</FONT> 2박3일<BR>신주쿠 워싱톤 호텔(더블룸)</td> #<td class="FRIDAYSPACING" width="220"><B><FONT COLOR="RED">569,000원</FONT>(2박3일) 도미인 아사쿠사 호텔</td> if product.find('idth="220">') > -1: print product print type(product) splitText = product.split('박'.decode('utf-8')) tmpText = re.findall('[\^0-9]+', tourUtil.getRemovedHtmlTag(splitText[0])) print 'Night : ', tmpText[len(tmpText)-1] tmpText = re.findall('[\^0-9]+', tourUtil.getRemovedHtmlTag(splitText[1])) print 'day : ', tmpText[0] #if product.find('(') > -1: #productCls.night = re.findall(r"\d", product.split('(')[1])[0] #productCls.period = re.findall(r"\d", product.split('(')[1])[1] #elif product.find('[') > -1: #productCls.night = re.findall(r"\d", product.split('[')[1])[0] #productCls.period = re.findall(r"\d", product.split('[')[1])[1] #print productCls.toString() query = savefilegethtml.getDetailMergeQueryTest1('naeiltour', 'productcode', productCls.code, productCls.productname, '20' + productCls.dDay, '', 'ICN', '', productCls.airCode, productCls.status, productCls.url, productCls.price, '0', '0', '0', '', productCls.night) #print 'Query : ' + query #break
def _airCodeBeforeGif(product):
    """Return the two characters that sit directly in front of '.gif'."""
    gifAt = product.find('.gif')
    return product[gifAt - 2:gifAt]


def _parseNightPeriod(product):
    """Return ``(night, period)`` parsed from a cp949-encoded HTML line.

    The decoded line is split on the Korean "nights" character; the last
    number before it is the night count and the first number after it is the
    period.  Both come back as UTF-8 byte strings, or ``('0', '0')`` when the
    character does not occur in the line.

    Raises IndexError when a side of the split holds no number.
    """
    splitText = product.decode('cp949').split(u'박')
    if len(splitText) < 2:
        return '0', '0'
    # NOTE(review): the character class also accepts a literal '^';
    # presumably plain digits were intended -- confirm before changing.
    before = re.findall(u'[\^0-9]+', tourUtil.getRemovedHtmlTag(splitText[0]))
    night = before[-1].encode('utf-8')
    after = re.findall(u'[\^0-9]+', tourUtil.getRemovedHtmlTag(splitText[1]))
    return night, after[0].encode('utf-8')


def _executeAndCommit(con, query):
    """Run *query* on a fresh cursor, commit, and always close the cursor."""
    cursor = con.cursor()
    try:
        cursor.execute(query)
        con.commit()
    finally:
        cursor.close()


def _mergeDetail(con, productcode, productCls, departCity, night):
    """Build the detail merge query for *productCls* and execute it."""
    query = savefilegethtml.getDetailMergeQueryTest1(
        'naeiltour', productcode, productCls.code, productCls.productname,
        '20' + productCls.dDay, '', productCls.period, departCity, '',
        productCls.airCode, productCls.status, productCls.url,
        productCls.price, '0', '0', '0', '', night)
    _executeAndCommit(con, query)


def searchProduct(filename, productcode, productName, period, targetUrl,
                  listUrl, productDetailUrl, departCity, tourkind, dmst_div,
                  country='', city='', comment=''):
    """Scrape one naeiltour product and merge it into the database.

    The page at ``targetUrl`` is scanned for ``fn_goodDetail('...')`` calls,
    whose first quoted argument is a departure day.  A master row is merged
    first; then, for every departure day, the list page
    ``listUrl + productcode + '&sel_day=' + day`` is read line by line.  A
    line containing ``fn_price('`` starts a new product record, and later
    lines (which ones depends on ``tourkind`` and on whether ``period`` is
    empty) complete the record and trigger a detail merge.

    Parameters:
        filename: open, writable file object used as the progress/error log.
        productcode, productName: identify the master product.
        period: trip length; '' means it is parsed from the page instead.
        targetUrl, listUrl, productDetailUrl: page URL and URL prefixes.
        departCity: departure city code; for ``tourkind == 'D'`` it is
            re-derived per row ('PUS' when the row contains 'COLOR=BLUE>',
            otherwise 'ICN') and that value carries over to later rows.
        tourkind: one of 'F', 'W', 'G', 'D'; selects the row layout.
        dmst_div, country, city, comment: passed to the master merge query.

    Returns None.  Failures are written to ``filename`` at three nesting
    levels ("Depth1/2/3 Error") and processing continues where possible.
    """
    detailHtml = savefilegethtml.getHtml(targetUrl, '', '',
                                         'naeiltourDetailHtml.txt')
    print >> filename, 'TargetUrl : ', targetUrl

    # Collect the available departure days.
    departDayList = [
        line.split("fn_goodDetail('")[1].split("'")[0]
        for line in detailHtml
        if line.find("fn_goodDetail('") > -1
    ]

    # Stays None when connect() fails so the finally clause does not raise
    # an unbound-name error on top of the real one.
    con = None
    try:
        # NOTE(review): credentials are hard-coded and the connect string
        # looks mangled -- confirm the real DSN and move it to configuration.
        con = cx_Oracle.connect("bigtour/[email protected]:1521/ora11g")
        codeList = codes.getCityCode(productName, city, comment, country)
        cityList = codeList[0]
        nationList = codeList[1]

        query = savefilegethtml.getMasterMergeQueryTest1(
            'naeiltour', productcode, '', country, city, productName,
            tourkind, dmst_div, comment, '', nationList, cityList)
        _executeAndCommit(con, query)

        productCls = clsProduct()
        for dayInfo in departDayList:
            try:
                productListUrl = listUrl + productcode + '&sel_day=' + dayInfo
                print('ProductListUrl : ' + productListUrl)
                productListHtml = savefilegethtml.getHtml(
                    productListUrl, '', '', 'naeiltourproductListHtml.txt')
                print >> filename, 'ProductListUrl : ' + productListUrl

                for product in productListHtml:
                    try:
                        if product.find("fn_price('") > -1:
                            # A new product row starts here; its fields are
                            # the quoted arguments of fn_price(...).
                            productCls = clsProduct()
                            productSplit = product.split('fn_price')[1].split("'")
                            productCls.productCode = productSplit[1]
                            productCls.dDay = productSplit[3]
                            productCls.code = productSplit[5]
                            if tourkind == 'W' or tourkind == 'G':
                                productCls.airCode = _airCodeBeforeGif(product)
                            else:
                                # This argument is a Korean-language code,
                                # whereas a two-letter English code is needed.
                                productCls.airCode = productSplit[7]
                            productCls.price = productSplit[9].replace(',', '')
                            # blank: bookable, 03: closing soon, 05: closed
                            productCls.status = codes.getStatus(
                                'naeiltour', productSplit[11])
                            productCls.url = (productDetailUrl + productcode
                                              + '&sel_day=' + productCls.dDay)
                            productCls.productname = productName
                            productCls.dTime = ''
                            productCls.aDay = ''
                            productCls.aTime = ''

                        if period != '' and tourkind == 'F':
                            # Period is already known; no night count.
                            if product.find('<td width="134">') > -1:
                                productCls.period = period
                                productCls.airCode = _airCodeBeforeGif(product)
                                _mergeDetail(con, productcode, productCls,
                                             departCity, '')
                        elif period == '' and tourkind in ('F', 'D'):
                            if (product.find('<td class="FRIDAYSPACING" >') > -1
                                    and product.find('.gif') > -1):
                                productCls.airCode = _airCodeBeforeGif(product)
                            if product.find('idth="220">') > -1:
                                (productCls.night,
                                 productCls.period) = _parseNightPeriod(product)
                                if tourkind == 'D':
                                    if product.find('COLOR=BLUE>') > -1:
                                        departCity = 'PUS'
                                    else:
                                        departCity = 'ICN'
                                _mergeDetail(con, productcode, productCls,
                                             departCity, productCls.night)
                        elif period == '' and tourkind in ('W', 'G'):
                            if product.find('valign="middle"') > -1:
                                (productCls.night,
                                 productCls.period) = _parseNightPeriod(product)
                                _mergeDetail(con, productcode, productCls,
                                             departCity, productCls.night)
                    except cx_Oracle.DatabaseError as err1:
                        print >> filename, err1
                    except Exception:
                        # Best effort: a bad row must not stop the rest.
                        print >> filename, "Depth3 Error:", sys.exc_info()[0]
            except Exception:
                print >> filename, "Depth2 Error:", sys.exc_info()[0]
    except Exception:
        print >> filename, "Depth1 Error:", sys.exc_info()[0]
    finally:
        if con is not None:
            con.close()
productNameHtml = savefilegethtml.htmlToList( productNameHtml, 'xxx.txt') for pdName in productNameHtml: if pdName.find('height="110" alt="') > 0: productNameList.append( pdName.split( 'alt="')[1].split('"')[0].replace( "'", "").strip().decode('utf-8')) # description을.. 다른놈으로 가져가야 할듯.. route로.. #if pdName.find('<p class="desc">') > 0: #productCommentList.append(pdName.split('desc">')[1].split('<')[0].replace("'", "").strip().decode('utf-8')) if pdName.find('<p class="route">') > 0: productCommentList.append( tourUtil.getRemovedHtmlTag( pdName).strip().replace( "'", "").decode('utf-8')) #today = today.replace(month = today.month + 1) codeIdx = 0 for pcode in codeList: detailProduct = pcode.split('s')[1] detailProductUrl = '' if not ( package.menuCode == 'A03' or package.menuCode == 'A06' ): # 출발일정 눌렀을때 List가 펼쳐지는 경우랑, 페이지가 이동하는 경우 나눔.. detailProductUrl = '' #if package.menuCode == 'A01':
def insertData(productCls, detailUrl, regionUrl, tourAgency, kind, dmst_div): print 'Product Url : ', productCls.url print >> exceptFile, 'Product Url : ', productCls.url # 2014. 7. 23. 카테고리의 국가는 넣지 않기로 함... #codeList = codes.getCityCode(productCls.name.decode('utf-8'), detailUrl.name.decode('utf-8'), regionUrl.name.decode('utf-8')) codeList = codes.getCityCode(productCls.name.decode('utf-8')) cityList = codeList[0] nationList = codeList[1] continentList = codeList[2] siteList = codeList[3] # 2014. 8. 3. site 추가 if len(cityList) == 0 and len(nationList) == 0 and len(continentList) == 0: codeList = codes.getCityCode(detailUrl.name.decode('utf-8')) cityList = codeList[0] nationList = codeList[1] continentList = codeList[2] siteList = codeList[3] # 2014. 8. 3. site 추가 # Master 상품 입력 query = tourQuery.getMasterMergeQuery(tourAgency, productCls.code, productCls.name.decode('utf-8'), menu.kind, dmst_div, '', '') #print query cursor = con.cursor() cursor.execute(query) con.commit() # Region Data 삭제 codes.insertRegionData(tourAgency, productCls.code, cityList, nationList, continentList, siteList) detailProductHtml = savefilegethtml.getHtml(productCls.url, '', '', 'tour2000DetailHtml'+targetMonth+'.txt') pl10Idx = 0 for detailProduct in detailProductHtml: try: if detailProduct.find('<span class="text_pink">') > -1 and detailProduct.find('<a href=') < 0: detailCls = clsProductDetail() numArray = tourUtil.getNumArray(detailProduct) if len(numArray) > 7: detailCls.dDay = targetYear + numArray[0] + numArray[1] detailCls.dTime = numArray[2] + numArray[3] detailCls.aDay = targetYear + numArray[4] + numArray[5] detailCls.aTime = numArray[6] + numArray[7] elif len(numArray) == 4: detailCls.dDay = targetYear + numArray[0] + numArray[1] detailCls.dTime = '' detailCls.aDay = targetYear + numArray[2] + numArray[3] detailCls.aTime = '' elif detailProduct.find('onError') > -1: detailCls.airCode = detailProduct[detailProduct.find('.gif') - 4:detailProduct.find('.gif') - 2] elif 
detailProduct.find('text_redB') > -1: numArray = tourUtil.getNumArray(tourUtil.getRemovedHtmlTag(detailProduct)) for num in numArray: detailCls.price += num elif detailProduct.find('</a></td>') > -1: if detailProduct.find('text_pink') > -1: detailCls.status = codes.getStatus('tour2000', '예약가능') elif detailProduct.find('text_blau') > -1: detailCls.status = codes.getStatus('tour2000', '출발가능') elif detailProduct.find('text_green') > -1: detailCls.status = codes.getStatus('tour2000', '대기예약') elif detailProduct.find('text_grayLightSmall') > -1: detailCls.status = codes.getStatus('tour2000', '예약마감') detailCls.remainSeat = tourUtil.getRemovedHtmlTag(detailProduct).replace("'", "").strip() elif detailProduct.find('<p class="pl10">') > -1: if pl10Idx == 0: pl10Idx = 1 detailCls.productName = tourUtil.getRemovedHtmlTag(detailProduct).replace("'", "").strip() detailCls.url = mainUrl + tourUtil.getTagAttr(detailProduct, 'a', 'href') detailCls.productSeq = detailProduct.split('ev_ym=')[1].split('&')[0] + detailProduct.split('ev_seq=')[1].split('&')[0] else: pl10Idx = 0 if detailCls.productName.find('부산출발') > -1: departCity = 'PUS' else: departCity = 'ICN' query = tourQuery.getDetailMergeQuery(tourAgency, productCls.code, detailCls.productSeq, detailCls.productName.decode('utf-8'), detailCls.dDay+detailCls.dTime, detailCls.aDay+detailCls.aTime, productCls.period, departCity, '', detailCls.airCode, detailCls.status, detailCls.url, detailCls.price, '0', '0', '0', '', productCls.night) #print >> exceptFile, query #print query cursor = con.cursor() cursor.execute(query) con.commit() #break except: print >> exceptFile, 'detail parcing Error : ', sys.exc_info()[0] pass
print >> exceptFile, 'List URL : ', defaultproductListUrl productList = urllib2.urlopen(defaultproductListUrl).read() codeList = re.findall(r"goodFocus\w*", productList) productNameList = list() productCommentList = list() productNameHtml = productList[productList.find('travel_top_section'):productList.find('frmGD')] productNameHtml = savefilegethtml.htmlToList(productNameHtml, 'xxx.txt') for pdName in productNameHtml: if pdName.find('height="110" alt="') > 0: productNameList.append(pdName.split('alt="')[1].split('"')[0].replace("'", "").strip().decode('utf-8')) # description을.. 다른놈으로 가져가야 할듯.. route로.. #if pdName.find('<p class="desc">') > 0: #productCommentList.append(pdName.split('desc">')[1].split('<')[0].replace("'", "").strip().decode('utf-8')) if pdName.find('<p class="route">') > 0: productCommentList.append(tourUtil.getRemovedHtmlTag(pdName).strip().replace("'", "").decode('utf-8')) #today = today.replace(month = today.month + 1) codeIdx = 0 for pcode in codeList: detailProduct = pcode.split('s')[1] detailProductUrl = '' if not (package.menuCode == 'A03' or package.menuCode == 'A06'): # 출발일정 눌렀을때 List가 펼쳐지는 경우랑, 페이지가 이동하는 경우 나눔.. detailProductUrl = '' #if package.menuCode == 'A01': detailProductUrl = 'http://www.ybtour.co.kr/Goods/' + urlMap[package.menuCode] + '/inc_evList_ajax.asp?goodCD=' + detailProduct + '&startDT=' + targetYear + targetMonth #detailProductUrl = 'http://www.ybtour.co.kr/Goods/overseas/inc_evList_ajax.asp?goodCD=150201119&startDT=201408'
# Log file for this tour2000 scraping run.
exceptFileName = 'tour2000Exception' + scrappingStartTime + '.txt'
exceptFile = open(exceptFileName, 'w')
print >> exceptFile, "Start : %s" % time.ctime()

# Read the whole-menu box of the tour2000 front page.
mainUrl = 'http://www.tour2000.co.kr'
mainHtml = savefilegethtml.getHtml(
    'http://www.tour2000.co.kr/index.asp',
    '<div class="navi_wholeMenu_box">',
    '<!-- navi_wholeMenu_wrapper// -->',
    'tour2000mainHtml.txt')

startMainUrl = False
menuList = []
MenuUrlCls = clsMenuUrls()
for each_line in mainHtml:
    # A heading line opens a new top-level menu.
    if 'text_pinkB14' in each_line:
        MenuUrlCls = clsMenuUrls()
        MenuUrlCls.kind = codes.getTourKind(
            tourAgency, tourUtil.getRemovedHtmlTag(each_line).strip())
        startMainUrl = True

    # For now everything is skipped except overseas packages, honeymoon,
    # golf and domestic (Jeju).
    if MenuUrlCls.kind in ('A', 'F', 'H', 'No'):
        continue

    if startMainUrl and '<li>' in each_line:
        SubMenuCls = clsSubMenu()
        SubMenuCls.name = tourUtil.getRemovedHtmlTag(each_line).strip()
        SubMenuCls.url = mainUrl + tourUtil.getTagAttr(each_line, 'a', 'href')
        MenuUrlCls.subMenuList.append(SubMenuCls)

    # The closing </div> ends the current menu.
    if startMainUrl and '</div>' in each_line:
        startMainUrl = False
        menuList.append(MenuUrlCls)
def insertData(productCls, detailUrl, regionUrl, tourAgency, kind, dmst_div): print 'Product Url : ', productCls.url print >> exceptFile, 'Product Url : ', productCls.url # 2014. 7. 23. 카테고리의 국가는 넣지 않기로 함... #codeList = codes.getCityCode(productCls.name.decode('utf-8'), detailUrl.name.decode('utf-8'), regionUrl.name.decode('utf-8')) codeList = codes.getCityCode(productCls.name.decode('utf-8')) cityList = codeList[0] nationList = codeList[1] continentList = codeList[2] siteList = codeList[3] # 2014. 8. 3. site 추가 if len(cityList) == 0 and len(nationList) == 0 and len(continentList) == 0: codeList = codes.getCityCode(detailUrl.name.decode('utf-8')) cityList = codeList[0] nationList = codeList[1] continentList = codeList[2] siteList = codeList[3] # 2014. 8. 3. site 추가 # Master 상품 입력 query = tourQuery.getMasterMergeQuery(tourAgency, productCls.code, productCls.name.decode('utf-8'), menu.kind, dmst_div, '', '') #print query cursor = con.cursor() cursor.execute(query) con.commit() # Region Data 삭제 codes.insertRegionData(tourAgency, productCls.code, cityList, nationList, continentList, siteList) detailProductHtml = savefilegethtml.getHtml( productCls.url, '', '', 'tour2000DetailHtml' + targetMonth + '.txt') pl10Idx = 0 for detailProduct in detailProductHtml: try: if detailProduct.find( '<span class="text_pink">') > -1 and detailProduct.find( '<a href=') < 0: detailCls = clsProductDetail() numArray = tourUtil.getNumArray(detailProduct) if len(numArray) > 7: detailCls.dDay = targetYear + numArray[0] + numArray[1] detailCls.dTime = numArray[2] + numArray[3] detailCls.aDay = targetYear + numArray[4] + numArray[5] detailCls.aTime = numArray[6] + numArray[7] elif len(numArray) == 4: detailCls.dDay = targetYear + numArray[0] + numArray[1] detailCls.dTime = '' detailCls.aDay = targetYear + numArray[2] + numArray[3] detailCls.aTime = '' elif detailProduct.find('onError') > -1: detailCls.airCode = detailProduct[detailProduct.find('.gif') - 4:detailProduct.find('.gif' ) - 2] elif 
detailProduct.find('text_redB') > -1: numArray = tourUtil.getNumArray( tourUtil.getRemovedHtmlTag(detailProduct)) for num in numArray: detailCls.price += num elif detailProduct.find('</a></td>') > -1: if detailProduct.find('text_pink') > -1: detailCls.status = codes.getStatus('tour2000', '예약가능') elif detailProduct.find('text_blau') > -1: detailCls.status = codes.getStatus('tour2000', '출발가능') elif detailProduct.find('text_green') > -1: detailCls.status = codes.getStatus('tour2000', '대기예약') elif detailProduct.find('text_grayLightSmall') > -1: detailCls.status = codes.getStatus('tour2000', '예약마감') detailCls.remainSeat = tourUtil.getRemovedHtmlTag( detailProduct).replace("'", "").strip() elif detailProduct.find('<p class="pl10">') > -1: if pl10Idx == 0: pl10Idx = 1 detailCls.productName = tourUtil.getRemovedHtmlTag( detailProduct).replace("'", "").strip() detailCls.url = mainUrl + tourUtil.getTagAttr( detailProduct, 'a', 'href') detailCls.productSeq = detailProduct.split( 'ev_ym=')[1].split('&')[0] + detailProduct.split( 'ev_seq=')[1].split('&')[0] else: pl10Idx = 0 if detailCls.productName.find('부산출발') > -1: departCity = 'PUS' else: departCity = 'ICN' query = tourQuery.getDetailMergeQuery( tourAgency, productCls.code, detailCls.productSeq, detailCls.productName.decode('utf-8'), detailCls.dDay + detailCls.dTime, detailCls.aDay + detailCls.aTime, productCls.period, departCity, '', detailCls.airCode, detailCls.status, detailCls.url, detailCls.price, '0', '0', '0', '', productCls.night) #print >> exceptFile, query #print query cursor = con.cursor() cursor.execute(query) con.commit() #break except: print >> exceptFile, 'detail parcing Error : ', sys.exc_info()[0] pass
# Read the whole-menu box of the tour2000 front page.
mainUrl = 'http://www.tour2000.co.kr'
mainHtml = savefilegethtml.getHtml(
    'http://www.tour2000.co.kr/index.asp',
    '<div class="navi_wholeMenu_box">',
    '<!-- navi_wholeMenu_wrapper// -->',
    'tour2000mainHtml.txt')

startMainUrl = False
menuList = []
MenuUrlCls = clsMenuUrls()
for each_line in mainHtml:
    # A heading line opens a new top-level menu.
    if 'text_pinkB14' in each_line:
        MenuUrlCls = clsMenuUrls()
        MenuUrlCls.kind = codes.getTourKind(
            tourAgency, tourUtil.getRemovedHtmlTag(each_line).strip())
        startMainUrl = True

    # For now everything is skipped except overseas packages, honeymoon,
    # golf and domestic (Jeju).
    if MenuUrlCls.kind in ('A', 'F', 'H', 'No'):
        continue

    if startMainUrl and '<li>' in each_line:
        SubMenuCls = clsSubMenu()
        SubMenuCls.name = tourUtil.getRemovedHtmlTag(each_line).strip()
        SubMenuCls.url = mainUrl + tourUtil.getTagAttr(each_line, 'a', 'href')
        MenuUrlCls.subMenuList.append(SubMenuCls)

    # The closing </div> ends the current menu.
    if startMainUrl and '</div>' in each_line:
        startMainUrl = False
        menuList.append(MenuUrlCls)
def searchProduct(filename, productcode, productName, period, targetUrl, listUrl, productDetailUrl, departCity, tourkind, dmst_div, country='', city='', comment=''): detailHtml = savefilegethtml.getHtml(targetUrl, '', '', 'naeiltourDetailHtml.txt') print >> filename, 'TargetUrl : ', targetUrl departDayList = list() for detail_each_line in detailHtml: if detail_each_line.find("fn_goodDetail('") > -1: departDayList.append( detail_each_line.split("fn_goodDetail('")[1].split("'")[0]) # 출발 가능 날짜에 항공사 찾아오는 부분 try: con = cx_Oracle.connect( "bigtour/[email protected]:1521/ora11g") codeList = codes.getCityCode(productName, city, comment, country) cityList = codeList[0] nationList = codeList[1] #print nationList #print cityList #print nationList #print cityList query = savefilegethtml.getMasterMergeQueryTest1( 'naeiltour', productcode, '', country, city, productName, tourkind, dmst_div, comment, '', nationList, cityList) # A : 해외(Abroad) #query = savefilegethtml.getMasterMergeQuery('naeiltour', productcode, '', country, city, productName, tourkind, dmst_div, comment, '', nationList, cityList) # A : 해외(Abroad) #print query cursor = con.cursor() cursor.execute(query) con.commit() productCls = clsProduct() for dayInfo in departDayList: try: productListUrl = listUrl + productcode + '&sel_day=' + dayInfo print 'ProductListUrl : ' + productListUrl productListHtml = savefilegethtml.getHtml( productListUrl, '', '', 'naeiltourproductListHtml.txt') print >> filename, 'ProductListUrl : ' + productListUrl for product in productListHtml: try: if product.find("fn_price('") > -1: productCls = clsProduct() productSplit = product.split('fn_price')[1].split( "'") productCls.productCode = productSplit[1] productCls.dDay = productSplit[3] productCls.code = productSplit[5] if tourkind == 'W' or tourkind == 'G': productCls.airCode = product[ product.find('.gif') - 2:product.find('.gif')] else: productCls.airCode = productSplit[ 7] # 한글 공항코드... but 우리는 영문2자리 공항코드가 필요하다... 
productCls.price = productSplit[9].replace(',', '') #print productSplit[11] productCls.status = codes.getStatus( 'naeiltour', productSplit[11] ) # 공백 : 예약가능, 03 : 마감임박, 05 : 마감 #if tourkind == 'W': #productCls.city = productSplit[13] productCls.url = productDetailUrl + productcode + '&sel_day=' + productCls.dDay productCls.productname = productName productCls.dTime = '' productCls.aDay = '' productCls.aTime = '' if period != '' and tourkind == 'F': if product.find('<td width="134">') > -1: productCls.period = period #print productCls.toString() productCls.airCode = product[ product.find('.gif') - 2:product.find('.gif')] query = savefilegethtml.getDetailMergeQueryTest1( 'naeiltour', productcode, productCls.code, productCls.productname, '20' + productCls.dDay, '', productCls.period, departCity, '', productCls.airCode, productCls.status, productCls.url, productCls.price, '0', '0', '0', '', '') #query = savefilegethtml.getDetailMergeQuery('naeiltour', productcode, productCls.code, productCls.productname, '20' + productCls.dDay, '', productCls.period, departCity, '', productCls.airCode, productCls.status, productCls.url, productCls.price, '0', '0', '0', '', '') #print query cursor = con.cursor() cursor.execute(query) con.commit() #break if period == '' and tourkind == 'F': if product.find('<td class="FRIDAYSPACING" >' ) > -1 and product.find( '.gif') > -1: productCls.airCode = product[ product.find('.gif') - 2:product.find('.gif')] if product.find('idth="220">') > -1: """ # 날짜 가져오는 부분... 
종류가 너무 많아서 좀 수정 if product.find('(') > -1: productCls.night = re.findall(r"\d", product.split('(')[1])[0] productCls.period = re.findall(r"\d", product.split('(')[1])[1] elif product.find('[') > -1: productCls.night = re.findall(r"\d", product.split('[')[1])[0] productCls.period = re.findall(r"\d", product.split('[')[1])[1] """ splitText = product.decode('cp949').split(u'박') if len(splitText) > 1: tmpText = re.findall( u'[\^0-9]+', tourUtil.getRemovedHtmlTag( splitText[0])) productCls.night = tmpText[ len(tmpText) - 1].encode('utf-8') tmpText = re.findall( u'[\^0-9]+', tourUtil.getRemovedHtmlTag( splitText[1])) productCls.period = tmpText[0].encode( 'utf-8') else: productCls.night = '0' productCls.period = '0' ############################################################################################# #print productCls.toString() query = savefilegethtml.getDetailMergeQueryTest1( 'naeiltour', productcode, productCls.code, productCls.productname, '20' + productCls.dDay, '', productCls.period, departCity, '', productCls.airCode, productCls.status, productCls.url, productCls.price, '0', '0', '0', '', productCls.night) #print 'Query : ' + query cursor = con.cursor() cursor.execute(query) con.commit() if period == '' and tourkind == 'W': if product.find('valign="middle"') > -1: """ if product.find('(') > -1: productCls.night = re.findall(r"\d", product.split('(')[1])[0] productCls.period = re.findall(r"\d", product.split('(')[1])[1] elif product.find('[') > -1: productCls.night = re.findall(r"\d", product.split('[')[1])[0] productCls.period = re.findall(r"\d", product.split('[')[1])[1] """ splitText = product.decode('cp949').split(u'박') if len(splitText) > 1: tmpText = re.findall( u'[\^0-9]+', tourUtil.getRemovedHtmlTag( splitText[0])) productCls.night = tmpText[ len(tmpText) - 1].encode('utf-8') tmpText = re.findall( u'[\^0-9]+', tourUtil.getRemovedHtmlTag( splitText[1])) productCls.period = tmpText[0].encode( 'utf-8') else: productCls.night = '0' productCls.period = 
'0' ############################################################################################# #print productCls.toString() query = savefilegethtml.getDetailMergeQueryTest1( 'naeiltour', productcode, productCls.code, productCls.productname, '20' + productCls.dDay, '', productCls.period, departCity, '', productCls.airCode, productCls.status, productCls.url, productCls.price, '0', '0', '0', '', productCls.night) #print 'Query : ' + query cursor = con.cursor() cursor.execute(query) con.commit() #break if period == '' and tourkind == 'G': if product.find('valign="middle"') > -1: """ if product.find('(') > -1: productCls.night = re.findall(r"\d", product.split('(')[1])[0] productCls.period = re.findall(r"\d", product.split('(')[1])[1] elif product.find('[') > -1: productCls.night = re.findall(r"\d", product.split('[')[1])[0] productCls.period = re.findall(r"\d", product.split('[')[1])[1] """ splitText = product.decode('cp949').split(u'박') if len(splitText) > 1: tmpText = re.findall( u'[\^0-9]+', tourUtil.getRemovedHtmlTag( splitText[0])) productCls.night = tmpText[ len(tmpText) - 1].encode('utf-8') tmpText = re.findall( u'[\^0-9]+', tourUtil.getRemovedHtmlTag( splitText[1])) productCls.period = tmpText[0].encode( 'utf-8') else: productCls.night = '0' productCls.period = '0' ############################################################################################# #print productCls.toString() query = savefilegethtml.getDetailMergeQueryTest1( 'naeiltour', productcode, productCls.code, productCls.productname, '20' + productCls.dDay, '', productCls.period, departCity, '', productCls.airCode, productCls.status, productCls.url, productCls.price, '0', '0', '0', '', productCls.night) #print 'Query : ' + query cursor = con.cursor() cursor.execute(query) con.commit() #break if period == '' and tourkind == 'D': if product.find('<td class="FRIDAYSPACING" >' ) > -1 and product.find( '.gif') > -1: productCls.airCode = product[ product.find('.gif') - 2:product.find('.gif')] if 
product.find('idth="220">') > -1: """ if product.find('(') > -1: productCls.night = re.findall(r"\d", product.split('[')[1])[0] productCls.period = re.findall(r"\d", product.split('')[1])[1] else: productCls.night = re.findall(r"\d", product.split('COLOR=#FF7A73>')[1])[0] productCls.period = re.findall(r"\d", product.split('COLOR=#FF7A73>')[1])[1] """ splitText = product.decode('cp949').split(u'박') if len(splitText) > 1: tmpText = re.findall( u'[\^0-9]+', tourUtil.getRemovedHtmlTag( splitText[0])) productCls.night = tmpText[ len(tmpText) - 1].encode('utf-8') tmpText = re.findall( u'[\^0-9]+', tourUtil.getRemovedHtmlTag( splitText[1])) productCls.period = tmpText[0].encode( 'utf-8') else: productCls.night = '0' productCls.period = '0' ############################################################################################# if product.find('COLOR=BLUE>') > -1: departCity = 'PUS' else: departCity = 'ICN' #print productCls.toString() query = savefilegethtml.getDetailMergeQueryTest1( 'naeiltour', productcode, productCls.code, productCls.productname, '20' + productCls.dDay, '', productCls.period, departCity, '', productCls.airCode, productCls.status, productCls.url, productCls.price, '0', '0', '0', '', productCls.night) #print 'Query : ' + query cursor = con.cursor() cursor.execute(query) con.commit() #break except cx_Oracle.DatabaseError as err1: print >> filename, err1 pass except: print >> filename, "Depth3 Error:", sys.exc_info()[0] pass #break except: print >> filename, "Depth2 Error:", sys.exc_info()[0] pass except: print >> filename, "Depth1 Error:", sys.exc_info()[0] pass finally: con.close()
except: print >> exceptFile, "backpack :", sys.exc_info()[0] pass try: for menu in backpackMenuList: #tit_position2 부산출발 검색조건.. print menu.url productListHtml = savefilegethtml.getHtml(menu.url, '<div id="sub_box2">', 'function btn(ckbtn){', 'productListHtml.txt') for each_line in productListHtml: if each_line.find('<h1 class="bic_h">') > -1: productName = each_line.split('bic_h">')[2].split('<')[0] #productNameSplit = productName.split(' ') #period = productNameSplit[len(productNameSplit)-1].replace('일', '') productNameSplit = re.findall('[\^0-9]+', tourUtil.getRemovedHtmlTag(each_line)) period = productNameSplit[len(productNameSplit)-1] #if each_line.find('<span class="goods_text">') > -1: # 여행 설명이 잘 적혀 있긴 하지만.. 내일투어에서는 국가 정보가 있는 걸 보조로 가져가야 할듯.. #tourRoute = each_line.split('px;">')[1].split('<')[0] if each_line.find('<span class="travel_box">') > -1: tourRoute = tourUtil.getRemovedHtmlTag(each_line) if each_line.find("sview('") > -1: productCode = each_line.split("sview('")[1].split("'")[0] code2 = each_line.split("sview('")[1].split("'")[2] detailUrl = 'http://www.naeiltour.co.kr/backpack/program_include_list.asp?good_cd='+ productCode + '&sel_ym=' + targetYear + targetMonth print >> exceptFile, 'DetailUrl : ', detailUrl listUrl = 'http://www.naeiltour.co.kr/backpack/program_include_list.asp?good_cd=' productDetailUrl = 'http://www.naeiltour.co.kr/backpack/show.asp?good_cd='
# Walk the lottetour main page line by line and build mainList: one mainCls
# per top-level menu, each with its subMenuCls links in subMenuList.
# NOTE(review): the block nesting was reconstructed from a flattened source.
for each_line in mainpageHtml:
    # Track HTML comments so commented-out links are ignored.
    if each_line.find('<!--') > -1:
        startComment = True
    elif each_line.find('-->') > -1:
        startComment = False

    if firstOversea and each_line.find('해외패키지') > -1:
        # First occurrence of the overseas-package heading opens that menu.
        clsMain = mainCls()
        clsMain.name = codes.getTourKind('lottetour', 'package')
        firstOversea = False
    elif (not startComment and each_line.find('<li') > -1
          and each_line.find('<a href=') > -1):
        clsSubMenu = subMenuCls()
        clsSubMenu.url = mainUrl + tourUtil.getTagAttr(each_line, 'a', 'href')
        if each_line.find('title') > -1:
            clsSubMenu.name = tourUtil.getRemovedHtmlTag(each_line).strip()
            clsMain.subMenuList.append(clsSubMenu)
        else:
            # The name is on a following line that carries title=.
            subMenu = True
    elif not startComment and subMenu and each_line.find('title=') > -1:
        clsSubMenu.name = each_line.split('>')[1].split('<')[0]
        clsMain.subMenuList.append(clsSubMenu)
        subMenu = False
    elif each_line.find('sub_depth0') > -1:
        # Drop the most recently collected entry when this marker appears.
        if len(clsMain.subMenuList) > 0:
            clsMain.subMenuList.pop()
    elif (each_line.find('class="fit"') > -1
          or each_line.find('class="honeymoon _open"') > -1
          or each_line.find('class="golf"') > -1
          or each_line.find('class="cruise line"') > -1
          or each_line.find('class="air line"') > -1):
        # A new top-level menu begins: store the finished one and start the
        # next, named after the <li> class attribute.
        mainList.append(clsMain)
        clsMain = mainCls()
        clsMain.name = codes.getTourKind(
            'lottetour', tourUtil.getTagAttr(each_line, 'li', 'class'))

    # A comment opened and closed on this same line must not stay "open" for
    # the following lines (the line itself was still treated as commented).
    if each_line.rfind('-->') > each_line.rfind('<!--') > -1:
        startComment = False
#최종 상품들 잡아넣자.. try: productCls = clsProduct() #productListHtml = open('productListHtml.txt') departConfirm = False for product in productListHtml: #print 'product : ' + product if product.find('pro_date') > -1: productCls = clsProduct() departConfirm = False #productCls.sDay = targetYear + product.split('pro_date">')[1].split('(')[0].strip().replace('/', '') #productCls.sTime = product.split('<br/>')[0].split(')')[1].strip().replace(':', '') #productCls.aDay = targetYear + product.split('<span>')[1].split('(')[0].strip().replace('/', '') #productCls.aTime = product.split('<span>')[1].split(')')[1].split('<')[0].strip().replace(':', '') daySplit = tourUtil.getNumArray( tourUtil.getRemovedHtmlTag( product)) productCls.sDay = '' productCls.sTime = '' productCls.aDay = '' productCls.aTime = '' if len(daySplit) > 1: productCls.sDay = targetYear + daySplit[ 0] + daySplit[1] if len(daySplit) > 3: productCls.sTime = daySplit[ 2] + daySplit[3] if len(daySplit) > 5: productCls.aDay = targetYear + daySplit[ 4] + daySplit[5] if len(daySplit) > 7: