def getParam(self, line):
    """Extract link parameters and the tour kind from one HTML line.

    Reads the values that follow ``startLocation=``, ``id=`` and ``type=``
    (each one ending just before the next ``amp;``), then the value after
    ``MLoc=`` and the text inside the first ``<span>``.  Results are stored
    on ``self`` as ``startLocation``, ``id``, ``type``, ``MLoc`` and
    ``tourType``; nothing is returned.
    """
    entity_tail = 'amp;'
    for field in ('startLocation', 'id', 'type'):
        marker = field + '='
        tail_at = line.find(entity_tail)
        value_at = line.find(marker) + len(marker)
        # The value stops one character before 'amp;' (presumably this drops
        # the '&' of an '&amp;' entity - confirm against real input).
        setattr(self, field, line[value_at:tail_at - 1])
        # Drop everything up to and including this 'amp;' so the next field
        # is searched for in the remainder only.
        line = line[tail_at + len(entity_tail):]

    mloc_marker = 'MLoc='
    mloc_at = line.find(mloc_marker) + len(mloc_marker)
    # MLoc ends one character before the first space of the remainder.
    self.MLoc = line[mloc_at:line.find(' ') - 1]

    # The label between '<span>' and the next '<' is mapped to a unified code.
    span_text = line.split('<span>')[1].split('<')[0]
    self.tourType = codes.getTourKind('modetour', span_text)
def getTourType(idx):
    """Translate a menu position into the tour-kind code for 'verygoodtour'.

    Args:
        idx: Menu position; the values 0 through 9 are recognised.

    Returns:
        The result of ``codes.getTourKind('verygoodtour', <site code>)`` for a
        recognised position, or the string ``'No'`` for anything else.
    """
    # One table instead of ten near-identical branches: position -> site code.
    site_codes = {
        0: 'P',
        1: 'F',
        2: 'D',
        3: 'PUS',
        4: 'W',
        5: 'G',
        6: 'Luxury',
        7: 'Air',
        8: 'Hotel',
        9: 'Company',
    }
    try:
        site_code = site_codes.get(idx)
    except TypeError:
        # An unhashable idx (e.g. a list) can never equal 0..9, so it falls
        # through to 'No' exactly as the comparison chain did.
        site_code = None

    if site_code is None:
        return 'No'
    return codes.getTourKind('verygoodtour', site_code)
def getParam(self, line):
    """Populate this object's link parameters from a single HTML line.

    Sets ``self.startLocation``, ``self.id`` and ``self.type`` from the text
    following the matching ``name=`` marker up to one character before the
    next ``amp;``; sets ``self.MLoc`` from the text after ``MLoc=`` up to one
    character before the first space; sets ``self.tourType`` from
    ``codes.getTourKind('modetour', <text of the first <span>>)``.
    """
    def _take(text, key):
        # Return (value after `key`, text left after the first 'amp;').
        cut = text.find('amp;')
        begin = text.find(key) + len(key)
        return text[begin:cut - 1], text[cut + len('amp;'):]

    # Each step consumes the line up to and including one 'amp;'.
    self.startLocation, line = _take(line, 'startLocation=')
    self.id, line = _take(line, 'id=')
    self.type, line = _take(line, 'type=')

    # The last parameter is delimited by a space rather than by 'amp;'.
    mloc_begin = line.find('MLoc=') + len('MLoc=')
    mloc_end = line.find(' ') - 1
    self.MLoc = line[mloc_begin:mloc_end]

    after_span = line.split('<span>')[1]
    self.tourType = codes.getTourKind('modetour', after_span.split('<')[0])
# Snippet: builds `menulist` for the 'tourbaksa' site by scanning homepageHtml line by line.
# NOTE(review): the original line breaks and indentation are lost here, so statement nesting must be inferred.
#   - a line with '<ul id="city' pushes the current product group to menulist when its tourkindgroup
#     is non-empty, creates a new clsTotalGroup, and sets departCity to 'ICN' ('city1'), 'PUS' ('city2') or 'TAE'.
#   - a '/submain/?' or '/SubMain/index.asp?' link, or an Areaindex.asp/areaindex.asp link on a line without '<li>',
#     creates a clsTourKindGroup; its tourkind is codes.getTourKind('tourbaksa', <cp949-decoded link text>).
#   - a '<li>' line containing neither '<!--' nor '-->' and mentioning Areaindex/areaindex/'M1=' becomes a
#     clsRegionUrlGroup (url prefixed with homepageUrl) appended to the current tour-kind group.
#   - '</ul>' adds the current tour-kind group to the product group unless it is already in the list.
#   - 'class="etcMenu"' appends the product group to menulist.
# The tail opens 'tourbaksaException<scrappingStartTime>.txt' for writing and logs the start time.
# NOTE(review): `print >>` and str.decode('cp949') are Python 2 constructs.
for each_line in homepageHtml: if each_line.find('<ul id="city') > -1: if len(productGroupCls.tourkindgroup) > 0: menulist.append(productGroupCls) productGroupCls = clsTotalGroup() if each_line.find('city1') > -1: productGroupCls.departCity = 'ICN' elif each_line.find('city2') > -1: productGroupCls.departCity = 'PUS' else: productGroupCls.departCity = 'TAE' elif each_line.find('href="/submain/?') > -1 or each_line.find('href="/SubMain/index.asp?') > -1 or (each_line.find('<li>') < 0 and (each_line.find('Areaindex.asp') > -1 or each_line.find('areaindex.asp') > -1)): tourkindGroupCls = clsTourKindGroup() tourkindGroupCls.url = each_line.split('href="')[1].split('">')[0] #tourkindGroupCls.tourkind = each_line.split('>')[1].split('<')[0] # Let's unify the code names.. tourkindGroupCls.tourkind = codes.getTourKind('tourbaksa', each_line.split('>')[1].split('<')[0].strip().decode('cp949')) elif each_line.find('<li>') > -1 and each_line.find('<!--') < 0 and each_line.find('-->') < 0 and (each_line.find('Areaindex') > -1 or each_line.find('areaindex') > -1 or each_line.find('M1=') > -1): regionUrlGroupCls = clsRegionUrlGroup() regionUrlGroupCls.region = each_line.split('</a>')[0].split('">')[1] regionUrlGroupCls.url = homepageUrl + each_line.split('href="')[1].split('"')[0] tourkindGroupCls.regionUrlGroup.append(regionUrlGroupCls) elif each_line.find('</ul>') > -1: if productGroupCls.tourkindgroup.count(tourkindGroupCls) < 1: productGroupCls.tourkindgroup.append(tourkindGroupCls) elif each_line.find('class="etcMenu"') > -1: menulist.append(productGroupCls) #homepageHtml.close() exceptFile = open('tourbaksaException'+scrappingStartTime+'.txt', 'w') print >> exceptFile, "Start : %s" % time.ctime()
# Snippet: scans mainpageHtml and collects menu entries (mainCls objects holding subMenuCls items).
# NOTE(review): line breaks/indentation are lost and the snippet is cut off after the final `if ...:`.
#   - startComment is set on a line containing '<!--' and cleared on a line containing '-->'.
#     NOTE(review): because the second test is an `elif`, a line holding both markers leaves startComment True.
#   - the first line containing the '해외패키지' label starts a new mainCls named codes.getTourKind('lottetour', 'package').
#   - outside comments, a line with '<li' and '<a href=' creates a subMenuCls whose url is mainUrl + the anchor href;
#     if the line also contains 'title' the tag-stripped text becomes the name and the item is appended,
#     otherwise subMenu is set so that a later line containing 'title=' supplies the name.
#   - the 'sub_depth0' branch is incomplete in this view.
startComment = False firstOversea = True subMenu = False mainList = list() clsMain = mainCls() clsSubMenu = subMenuCls() for each_line in mainpageHtml: #print main if each_line.find('<!--') > -1 : startComment = True elif each_line.find('-->') > -1: startComment = False if firstOversea and each_line.find('해외패키지') > -1: clsMain = mainCls() clsMain.name = codes.getTourKind('lottetour', 'package') firstOversea = False elif not startComment and each_line.find('<li') > -1 and each_line.find('<a href=') > -1: clsSubMenu = subMenuCls() clsSubMenu.url = mainUrl + tourUtil.getTagAttr(each_line, 'a', 'href') if each_line.find('title') > -1: clsSubMenu.name = tourUtil.getRemovedHtmlTag(each_line).strip() clsMain.subMenuList.append(clsSubMenu) else: subMenu = True elif not startComment and subMenu and each_line.find('title=') > -1: clsSubMenu.name = each_line.split('>')[1].split('<')[0] clsMain.subMenuList.append(clsSubMenu) subMenu = False elif each_line.find('sub_depth0') > -1: if len(clsMain.subMenuList) > 0:
# Snippet: interior of a menu-parsing loop; the enclosing loop and the opening `if` are not visible here.
# NOTE(review): line breaks/indentation are lost, so the extent of each branch must be inferred.
#   - main-menu branch: url comes from the anchor href of menuList and dmst_div is set to 'A'.
#     When menuList contains '부산출발', departCity is 'PUS' and seven hard-coded region URLs (region_cd D10..D70)
#     are appended through pusanUrl(...); otherwise departCity is 'ICN'.
#     tourType is codes.getTourKind(tourAgency, mainMenuUrls.name); a line containing '국내여행' sets chkDomestic.
#   - sub-menu branch (chkDomestic false, '<li><a href=' with 'region_cd=' and without '전체'): builds a
#     clsSubMenuUrls from the tag-stripped text and href, then fetches the page with savefilegethtml.getHtml.
# NOTE(review): the bare `print` statements are Python 2 syntax.
mainMenuUrls.url = tourUtil.getTagAttr(menuList, 'a', 'href') mainMenuUrls.dmst_div = 'A' if menuList.find('부산출발') > -1: mainMenuUrls.departCity = 'PUS' # For Busan.. the per-region URLs are not exposed directly.. so they are forced in by hand.. mainMenuUrls.subMenuList.append(pusanUrl('동남아', 'http://www.onlinetour.co.kr/web/tour?region_cd=D10')) mainMenuUrls.subMenuList.append(pusanUrl('일본', 'http://www.onlinetour.co.kr/web/tour?region_cd=D20')) mainMenuUrls.subMenuList.append(pusanUrl('중국', 'http://www.onlinetour.co.kr/web/tour?region_cd=D30')) mainMenuUrls.subMenuList.append(pusanUrl('괌/사이판', 'http://www.onlinetour.co.kr/web/tour?region_cd=D40')) mainMenuUrls.subMenuList.append(pusanUrl('남태평양', 'http://www.onlinetour.co.kr/web/tour?region_cd=D50')) mainMenuUrls.subMenuList.append(pusanUrl('유럽/특수', 'http://www.onlinetour.co.kr/web/tour?region_cd=D60')) mainMenuUrls.subMenuList.append(pusanUrl('미주/특수', 'http://www.onlinetour.co.kr/web/tour?region_cd=D70')) else: mainMenuUrls.departCity = 'ICN' mainMenuUrls.tourType = codes.getTourKind(tourAgency, mainMenuUrls.name) print mainMenuUrls.name.decode('utf-8') + ' : ' + mainMenuUrls.url + ' : ' + mainMenuUrls.tourType #print >> exceptFile, mainMenuUrls.name + ' : ' + mainMenuUrls.url + ' : ' + mainMenuUrls.tourType if menuList.find('국내여행') > -1: chkDomestic = True elif not chkDomestic and menuList.find('<li><a href=') > -1 and menuList.find('region_cd=') > -1 and menuList.find('전체') < 0: subMenuUrls = clsSubMenuUrls() subMenuUrls.name = tourUtil.getRemovedHtmlTag(menuList).strip() subMenuUrls.url = tourUtil.getTagAttr(menuList, 'a', 'href') print subMenuUrls.name.decode('utf-8') + ' : ' + subMenuUrls.url #print >> exceptFile, 'subMenuUrls : ' + subMenuUrls.url detailProductHtml = savefilegethtml.getHtml(subMenuUrls.url, 'class="container', '<!-- end .ot_tab_style1 -->', 'onlinetourSubPage.txt')
# Snippet: interior of a per-line menu scan for the 'tourbaksa' site; the enclosing loop and the first
# branch of the if/elif chain are not visible, and the final `elif ...:` has no body in this view.
# NOTE(review): line breaks/indentation are lost, so statement nesting must be inferred.
#   - departCity is 'ICN' for 'city1', 'PUS' for 'city2', otherwise 'TAE'.
#   - submain / Areaindex.asp links on lines without '<li>' create a clsTourKindGroup whose tourkind is
#     codes.getTourKind('tourbaksa', <cp949-decoded link text>).
#   - '<li>' lines containing neither '<!--' nor '-->' that mention Areaindex/areaindex/'M1=' become
#     clsRegionUrlGroup entries (url prefixed with homepageUrl) on the current tour-kind group.
#   - '</ul>' adds the current tour-kind group to the product group unless it is already in the list.
# NOTE(review): str.decode('cp949') is a Python 2 construct.
if each_line.find('city1') > -1: productGroupCls.departCity = 'ICN' elif each_line.find('city2') > -1: productGroupCls.departCity = 'PUS' else: productGroupCls.departCity = 'TAE' elif each_line.find('href="/submain/?') > -1 or each_line.find( 'href="/SubMain/index.asp?') > -1 or ( each_line.find('<li>') < 0 and (each_line.find('Areaindex.asp') > -1 or each_line.find('areaindex.asp') > -1)): tourkindGroupCls = clsTourKindGroup() tourkindGroupCls.url = each_line.split('href="')[1].split('">')[0] #tourkindGroupCls.tourkind = each_line.split('>')[1].split('<')[0] # Let's unify the code names.. tourkindGroupCls.tourkind = codes.getTourKind( 'tourbaksa', each_line.split('>')[1].split('<')[0].strip().decode('cp949')) elif each_line.find('<li>') > -1 and each_line.find( '<!--') < 0 and each_line.find('-->') < 0 and ( each_line.find('Areaindex') > -1 or each_line.find('areaindex') > -1 or each_line.find('M1=') > -1): regionUrlGroupCls = clsRegionUrlGroup() regionUrlGroupCls.region = each_line.split('</a>')[0].split('">')[1] regionUrlGroupCls.url = homepageUrl + each_line.split( 'href="')[1].split('"')[0] tourkindGroupCls.regionUrlGroup.append(regionUrlGroupCls) elif each_line.find('</ul>') > -1: if productGroupCls.tourkindgroup.count(tourkindGroupCls) < 1: productGroupCls.tourkindgroup.append(tourkindGroupCls) elif each_line.find('class="etcMenu"') > -1:
# Snippet: downloads the package list for the 'ybtour' site and turns each entry into a classPackage.
# NOTE(review): line breaks/indentation are lost and the `try:` block is cut off (no handler visible).
#   - mainUrl is printed, logged to exceptFile, fetched with urllib2.urlopen and parsed with xmltodict.parse.
#   - urlMap maps menu codes A01..A18 to the URL path segment used for that menu.
#   - packageMap maps the same menu codes to codes.getTourKind('ybtour', <site code>).
#   - every item of packageListDict['ROOT']['List'] supplies MenuCD, MenuNM, GoodTypeCD and SBAR for a
#     classPackage that is appended to ml1List.
# NOTE(review): the bare `print`, `print >>` and urllib2 are Python 2 only.
print 'Main URL : ' + mainUrl print >> exceptFile, mainUrl packageListXml = urllib2.urlopen(mainUrl).read() packageListDict = xmltodict.parse(packageListXml) urlMap = dict() urlMap['A01'] = 'overseas' # overseas urlMap['A03'] = 'airtel' # airtel urlMap['A06'] = 'Honeymoon' # Honeymoon urlMap['A09'] = 'Overseas' # Golf urlMap['A12'] = 'Overseas' # Domestic travel... but.. the URL uses Overseas.. urlMap['A15'] = 'Overseas' # Regional departure... but the URL uses Overseas urlMap['A18'] = 'Overseas' # Cruise but the URL uses Overseas packageMap = dict() packageMap['A01'] = codes.getTourKind('ybtour', 'P') packageMap['A03'] = codes.getTourKind('ybtour', 'F') packageMap['A06'] = codes.getTourKind('ybtour', 'W') packageMap['A09'] = codes.getTourKind('ybtour', 'G') packageMap['A12'] = codes.getTourKind('ybtour', 'D') packageMap['A15'] = codes.getTourKind('ybtour', 'PUS') packageMap['A18'] = codes.getTourKind('ybtour', 'C') for pack in packageListDict['ROOT']['List']: try: package = classPackage() package.menuCode = pack['MenuCD'] package.menuName = pack['MenuNM'] package.goodTypeCode = pack['GoodTypeCD'] package.sbar = pack['SBAR'] ml1List.append(package)
# Snippet: tail of a function whose header is not visible, followed by module-level setup for 'hanatour'.
# NOTE(review): line breaks/indentation are lost, so where the function body ends must be inferred.
#   - function tail: logs and returns the text between 'province_' and the next '_' in html, or 'ICN' in the else branch.
#   - setup: targetYear and targetMonth come from sys.argv[1] and sys.argv[2]; scrappingStartTime is the current
#     time formatted as "%Y%m%d%H%M%S%f".
#   - mainUrls receives five clsMenuUrls entries (codes 'P', 'W', 'G', 'C', 'D'), each pairing
#     codes.getTourKind('hanatour', <code>) with a fixed landing-page URL.
# NOTE(review): `print >>` is Python 2 syntax.
print >> exceptFile, 'Start City : ', html.split('province_')[1].split('_')[0] return html.split('province_')[1].split('_')[0] else: print >> exceptFile, 'Start City : ICN' return 'ICN' # Time variables.. tourAgency = 'hanatour' targetYear = sys.argv[1] targetMonth = sys.argv[2] #targetYear = '2014' #targetMonth = '07' scrappingStartTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f") mainUrls = list() packageCls = clsMenuUrls(codes.getTourKind('hanatour', 'P'), 'http://www.hanatour.com/asp/booking/oversea/oversea-main.asp?hanacode=overseas_M_bi') # Package honeymonCls = clsMenuUrls(codes.getTourKind('hanatour', 'W'), 'http://www.hanatour.com/asp/booking/honeymoon/hr-main.asp?hanacode=main_q_pack_honey') # Honeymoon golfCls = clsMenuUrls(codes.getTourKind('hanatour', 'G'), 'http://www.hanatour.com/asp/booking/golf/golf-main.asp?hanacode=main_q_pack_golf') # Golf cruiseUrl = clsMenuUrls(codes.getTourKind('hanatour', 'C'), 'http://www.hanatour.com/asp/booking/cruise/cruise-main.asp?hanacode=main_q_pack_cruise') # Cruise jejuUrl = clsMenuUrls(codes.getTourKind('hanatour', 'D'), 'http://www.hanatour.com/asp/booking/local/local-cheju.asp?hanacode=main_q_dom_jeju') # Jeju mainUrls.append(packageCls) mainUrls.append(honeymonCls) mainUrls.append(golfCls) mainUrls.append(cruiseUrl) mainUrls.append(jejuUrl) #productPackage/pk- value is present and... etc_code=P .. means a package #when a pkg_mst_code value is present.. it is already the detail-lookup content...(the date-selection one..) this case does occur too.. #etc_code=W/P/A/B/K/Y/J/C 'W' : honeymoon, 'A': free, 'P' : package, 'B' : AirTel, 'K' : Tracking, 'Y' : Leports, 'J' : pilgrimage, 'C' : Cruise #</form><span class="free_go">
# Snippet: opens the exception log for the 'tour2000' scraper and collects the site's menu URLs.
# NOTE(review): line breaks/indentation are lost; in particular it cannot be told from here whether the
# `continue` guard sits inside the 'text_pinkB14' branch or directly in the loop body.
#   - the log file is 'tour2000Exception<scrappingStartTime>.txt' and receives a start timestamp.
#   - mainHtml is the part of index.asp between the two marker strings passed to savefilegethtml.getHtml.
#   - a line containing 'text_pinkB14' starts a new clsMenuUrls whose kind is
#     codes.getTourKind(tourAgency, <tag-stripped text>) and sets startMainUrl.
#   - kinds 'A', 'F', 'H' and 'No' trigger `continue`.
#   - while startMainUrl is set, '<li>' lines become clsSubMenu entries (url prefixed with mainUrl) and
#     '</div>' clears startMainUrl and appends the current clsMenuUrls to menuList.
# NOTE(review): `print >>` is Python 2 syntax.
exceptFileName = 'tour2000Exception' + scrappingStartTime + '.txt' exceptFile = open(exceptFileName, 'w') print >> exceptFile, "Start : %s" % time.ctime() mainUrl = 'http://www.tour2000.co.kr' mainHtml = savefilegethtml.getHtml('http://www.tour2000.co.kr/index.asp', '<div class="navi_wholeMenu_box">', '<!-- navi_wholeMenu_wrapper// -->', 'tour2000mainHtml.txt') startMainUrl = False menuList = list() MenuUrlCls = clsMenuUrls() for each_line in mainHtml: if each_line.find('text_pinkB14') > -1: MenuUrlCls = clsMenuUrls() MenuUrlCls.kind = codes.getTourKind(tourAgency, tourUtil.getRemovedHtmlTag(each_line).strip()) startMainUrl = True # For now, skip everything except overseas travel (package), honeymoon, golf and domestic (Jeju) if MenuUrlCls.kind == 'A' or MenuUrlCls.kind == 'F' or MenuUrlCls.kind == 'H' or MenuUrlCls.kind == 'No': continue if startMainUrl and each_line.find('<li>') > -1: SubMenuCls = clsSubMenu() SubMenuCls.name = tourUtil.getRemovedHtmlTag(each_line).strip() SubMenuCls.url = mainUrl + tourUtil.getTagAttr(each_line, 'a', 'href') MenuUrlCls.subMenuList.append(SubMenuCls) if startMainUrl and each_line.find('</div>') > -1: startMainUrl = False menuList.append(MenuUrlCls)
# Snippet: collects the 'tour2000' menu URLs from the site's index page; the view ends right after
# startMainUrl is cleared, so whatever follows in that branch is not visible.
# NOTE(review): line breaks/indentation are lost; it cannot be told from here whether the `continue`
# guard sits inside the 'text_pinkB14' branch or directly in the loop body.
#   - mainHtml is the part of index.asp between the two marker strings passed to savefilegethtml.getHtml.
#   - a line containing 'text_pinkB14' starts a new clsMenuUrls whose kind is
#     codes.getTourKind(tourAgency, <tag-stripped text>) and sets startMainUrl.
#   - kinds 'A', 'F', 'H' and 'No' trigger `continue`.
#   - while startMainUrl is set, '<li>' lines become clsSubMenu entries (url prefixed with mainUrl) and
#     '</div>' clears startMainUrl.
mainUrl = 'http://www.tour2000.co.kr' mainHtml = savefilegethtml.getHtml('http://www.tour2000.co.kr/index.asp', '<div class="navi_wholeMenu_box">', '<!-- navi_wholeMenu_wrapper// -->', 'tour2000mainHtml.txt') startMainUrl = False menuList = list() MenuUrlCls = clsMenuUrls() for each_line in mainHtml: if each_line.find('text_pinkB14') > -1: MenuUrlCls = clsMenuUrls() MenuUrlCls.kind = codes.getTourKind( tourAgency, tourUtil.getRemovedHtmlTag(each_line).strip()) startMainUrl = True # For now, skip everything except overseas travel (package), honeymoon, golf and domestic (Jeju) if MenuUrlCls.kind == 'A' or MenuUrlCls.kind == 'F' or MenuUrlCls.kind == 'H' or MenuUrlCls.kind == 'No': continue if startMainUrl and each_line.find('<li>') > -1: SubMenuCls = clsSubMenu() SubMenuCls.name = tourUtil.getRemovedHtmlTag(each_line).strip() SubMenuCls.url = mainUrl + tourUtil.getTagAttr(each_line, 'a', 'href') MenuUrlCls.subMenuList.append(SubMenuCls) if startMainUrl and each_line.find('</div>') > -1: startMainUrl = False
# Snippet: else-branch tail of a function whose header is not visible, followed by module-level setup for
# 'hanatour'; the view is cut off in the middle of the jejuUrl assignment.
# NOTE(review): line breaks/indentation are lost, so where the function body ends must be inferred.
#   - function tail: logs 'Start City : ICN' to exceptFile and returns 'ICN'.
#   - setup: targetYear and targetMonth come from sys.argv[1] and sys.argv[2]; scrappingStartTime is the current
#     time formatted as "%Y%m%d%H%M%S%f".
#   - each clsMenuUrls pairs codes.getTourKind('hanatour', <code>) with a fixed landing-page URL.
# NOTE(review): `print >>` is Python 2 syntax.
else: print >> exceptFile, 'Start City : ICN' return 'ICN' # Time variables.. tourAgency = 'hanatour' targetYear = sys.argv[1] targetMonth = sys.argv[2] #targetYear = '2014' #targetMonth = '07' scrappingStartTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f") mainUrls = list() packageCls = clsMenuUrls(codes.getTourKind( 'hanatour', 'P' ), 'http://www.hanatour.com/asp/booking/oversea/oversea-main.asp?hanacode=overseas_M_bi' ) # Package honeymonCls = clsMenuUrls(codes.getTourKind( 'hanatour', 'W' ), 'http://www.hanatour.com/asp/booking/honeymoon/hr-main.asp?hanacode=main_q_pack_honey' ) # Honeymoon golfCls = clsMenuUrls(codes.getTourKind( 'hanatour', 'G' ), 'http://www.hanatour.com/asp/booking/golf/golf-main.asp?hanacode=main_q_pack_golf' ) # Golf cruiseUrl = clsMenuUrls(codes.getTourKind( 'hanatour', 'C' ), 'http://www.hanatour.com/asp/booking/cruise/cruise-main.asp?hanacode=main_q_pack_cruise' ) # Cruise jejuUrl = clsMenuUrls(codes.getTourKind(