def insertOneRec(self, info):
    """Insert the record produced by ``info.asRec()`` and commit it.

    The statement in ``self.INSERT_SQL`` is executed on a fresh cursor taken
    from ``self.conn``. If executing or committing fails, the error is
    logged and the transaction is rolled back; the exception is not
    propagated. The method returns nothing.
    """
    cursor = self.conn.cursor()
    try:
        cursor.execute(self.INSERT_SQL, info.asRec())
        self.conn.commit()
    except Exception as e:
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate, and log the cause for diagnosis.
        L.error(e)
        L.error("insert failed")
        self.conn.rollback()
    finally:
        # Release the cursor even if the rollback itself raises.
        cursor.close()
def realFunc(*args, **kwargs):
    """Call ``func`` with an alarm of ``timeout`` seconds armed around it.

    ``func`` and ``timeout`` are free variables taken from the enclosing
    scope (not visible in this block). Returns whatever ``func`` returns,
    or ``ER_TIMEOUT`` if any ``Exception`` escapes from it.

    NOTE(review): presumably a SIGALRM handler installed elsewhere raises
    an exception to interrupt ``func`` -- confirm. Any other exception
    raised by ``func`` is reported as a timeout as well.
    """
    signal.alarm(timeout)
    try:
        return func(*args, **kwargs)
    except Exception as e:
        L.error(e)
        L.error("execute timeout")
        return ER_TIMEOUT
    finally:
        # Always cancel the alarm. If func failed before the deadline the
        # alarm would otherwise stay armed and fire later in unrelated code.
        signal.alarm(0)
def realFunc(*args, **kwargs):
    """Run ``func`` under a ``timeout``-second alarm.

    ``func`` and ``timeout`` come from the enclosing scope (not visible in
    this block). The result of ``func`` is returned unchanged. If an
    ``Exception`` escapes from ``func`` it is logged and ``ER_TIMEOUT`` is
    returned instead.

    NOTE(review): this relies on a SIGALRM handler, presumably installed by
    the enclosing code, that raises to abort ``func`` -- confirm. Failures
    unrelated to the alarm are also mapped to ``ER_TIMEOUT``.
    """
    signal.alarm(timeout)
    try:
        ret = func(*args, **kwargs)
    except Exception as e:
        L.error(e)
        L.error("execute timeout")
        return ER_TIMEOUT
    finally:
        # Disarm on every path: previously an early failure in func left the
        # alarm pending, so it could go off after this wrapper had returned.
        signal.alarm(0)
    return ret
def getRecordNum(cursor, QUERY_SQL):
    """Run a counting query and return the first column of its first row.

    Args:
        cursor: an open DB cursor; it is always closed before returning.
        QUERY_SQL: a pair ``(sql, params)`` passed to ``cursor.execute``.

    Returns:
        The first field of the first result row.

    Raises:
        Whatever ``execute`` / ``fetchone`` raise (including the
        ``TypeError`` from subscripting when no row is returned); the
        failure is logged first and then re-raised unchanged.
    """
    try:
        cursor.execute(QUERY_SQL[0], QUERY_SQL[1])
        return cursor.fetchone()[0]
    except Exception:
        L.error("query record number failed")
        # Bare raise keeps the original traceback ("raise e" resets it on
        # Python 2).
        raise
    finally:
        cursor.close()
def statis4FlightLowestPriceInfo(duration):
    """Print the per-day record count for the last ``duration`` days.

    One line is printed per day, from ``duration`` days ago up to today,
    using the count reported by ``FlightLowestPriceInfoHandler``.
    Returns ``ER_CONN_DB_FAILED`` when no DB connection is available;
    otherwise returns nothing.
    """
    conn = g_db.getConn()
    if not conn:
        L.error("connect db failed.")
        return ER_CONN_DB_FAILED

    handler = FlightLowestPriceInfoHandler(conn)
    print("FlightLowestPriceInfo:")
    # Walk from the oldest day towards today (offset 0).
    for daysAgo in reversed(range(duration + 1)):
        day = datetime.datetime.today() - datetime.timedelta(days=daysAgo)
        queryDate = day.strftime("%Y-%m-%d")
        recNum = handler.getRecordNum(query_date=queryDate)
        print("{} : {}".format(queryDate, recNum))
def searchOne(casperScript, dep, arr, depDate):
    """Run the casperjs script for one route and date, then store the result.

    ``dep`` and ``arr`` are indexable; elements 0 and 1 of each are passed
    to the script, and element 0 of each is forwarded to
    ``processDataByFile``.

    Returns the command's status when it is not ``ER_SUCC``, otherwise the
    return value of ``processDataByFile``.
    """
    cmd = "casperjs '{}' '{}' '{}' '{}' '{}' '{}'".format(casperScript, dep[0], dep[1], arr[0], arr[1], depDate)
    status, output = commands.getstatusoutput(cmd)

    if status == ER_SUCC:
        L.debug("Execute command[{}] succeed", cmd)
        # NOTE(review): presumably the casperjs script writes its page dump
        # to this fixed path -- confirm against the script.
        return processDataByFile("/tmp/searchResult.html", depDate, dep[0], arr[0])

    L.error("Execute command[{}] failed, errCode: {}, errMsg:{}", cmd, status, output)
    return status
def insertOneRec(self, flightInfo):
    """Insert the record produced by ``flightInfo.asRec()`` and commit it.

    Returns:
        ``ER_SUCC`` when the row was inserted and committed, or
        ``ER_INSERT_FAILED`` when executing or committing failed (the
        error is logged and the transaction rolled back).
    """
    cursor = self.conn.cursor()
    try:
        cursor.execute(self.INSERT_SQL, flightInfo.asRec())
        self.conn.commit()
    except Exception as e:
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate, and log the cause for diagnosis.
        L.error(e)
        L.error("insert failed")
        self.conn.rollback()
        return ER_INSERT_FAILED
    finally:
        # The failure path used to return before closing the cursor;
        # closing here releases it on every path.
        cursor.close()
    return ER_SUCC
def searchRange(casperScript, dep, arr, dateRange, retryTimes):
    """Search one route for each of the next ``dateRange`` days.

    Starting with tomorrow, ``searchOne`` is called for every day, at most
    ``retryTimes`` times per day, stopping at the first ``ER_SUCC``. A day
    that never succeeds is logged as an error. Returns nothing.
    """
    oneDay = datetime.timedelta(days=1)
    depTime = datetime.datetime.today()

    for _ in range(dateRange):
        depTime = depTime + oneDay
        depDate = depTime.strftime("%Y-%m-%d")

        # Starts as success so that retryTimes == 0 reports no error.
        ret = ER_SUCC
        for _attempt in range(retryTimes):
            ret = searchOne(casperScript, dep, arr, depDate)
            if ret == ER_SUCC:
                L.info("{} -> {} {}", dep[0], arr[0], depDate)
                break

        if ret != ER_SUCC:
            L.error("retry {} times, {} -> {} {} failed", retryTimes, dep[0], arr[0], depDate)
def searchOne(casperScript, dep, arr, depDate):
    """Invoke casperjs for a single departure/arrival/date and process it.

    The script receives ``dep[0]``, ``dep[1]``, ``arr[0]``, ``arr[1]`` and
    ``depDate`` as quoted command-line arguments. On a status other than
    ``ER_SUCC`` that status is logged and returned; otherwise the result
    of ``processDataByFile`` is returned.
    """
    scriptArgs = (casperScript, dep[0], dep[1], arr[0], arr[1], depDate)
    cmd = "casperjs '{}' '{}' '{}' '{}' '{}' '{}'".format(*scriptArgs)

    result = commands.getstatusoutput(cmd)
    exitStatus = result[0]
    cmdOutput = result[1]
    if exitStatus != ER_SUCC:
        L.error("Execute command[{}] failed, errCode: {}, errMsg:{}", cmd,
                exitStatus, cmdOutput)
        return exitStatus

    L.debug("Execute command[{}] succeed", cmd)
    # NOTE(review): the fixed path is presumably where the casperjs script
    # saves the page -- confirm against the script.
    resultFile = "/tmp/searchResult.html"
    return processDataByFile(resultFile, depDate, dep[0], arr[0])
def crawlAllAirlinesWithRetry(self, retryTime):
    """Crawl every airline, then retry the failed ones up to ``retryTime`` times.

    After ``self.crawlAllAirlines()``, the entries of ``self.lstFailAirline``
    are re-crawled with ``self.crawlOneAirline`` (start date: tomorrow).
    Each round only retries the entries that failed in the previous round.

    Returns:
        ``ER_SUCC`` once no failed airline remains; ``None`` if some are
        still failing after the last round (each one is logged as an error).
    """
    self.crawlAllAirlines()
    # Work on a snapshot so that retrying never iterates a list that is
    # being modified at the same time.
    lstCur = list(self.lstFailAirline)
    for i in range(1, retryTime + 1):
        if not lstCur:
            return ER_SUCC
        # A fresh list per round. Reusing one list made lstCur and lstNext
        # the same object, so round 2 appended to the list it was iterating
        # and never terminated for an airline that keeps failing.
        lstNext = []
        for it in lstCur:
            L.info("retry[{}] {} -> {}", i, it[0][0], it[1][0])
            startDate = datetime.datetime.today() + datetime.timedelta(days=1)
            if ER_SUCC != self.crawlOneAirline(it[0], it[1], startDate):
                lstNext.append(it)
        lstCur = lstNext

    if not lstCur:
        # Everything succeeded on the final round (or there was nothing to retry).
        return ER_SUCC
    for it in lstCur:
        L.error("{} -> {} retry {} times failed", it[0][0], it[1][0], retryTime)
def crawlAllAirlinesWithRetry(self, retryTime):
    """Run a full crawl and retry failed airlines for up to ``retryTime`` rounds.

    The initial failures are read from ``self.lstFailAirline`` after
    ``self.crawlAllAirlines()``. In every round each remaining entry is
    crawled again via ``self.crawlOneAirline`` with tomorrow as the start
    date; entries that fail again are carried over to the next round.

    Returns:
        ``ER_SUCC`` when no failed airline is left, otherwise ``None``
        after logging every airline that is still failing.
    """
    self.crawlAllAirlines()
    # Copy the list so the retry rounds do not iterate a shared, mutable list.
    pending = list(self.lstFailAirline)

    for roundNo in range(1, retryTime + 1):
        if not pending:
            return ER_SUCC

        # Build the next round's list from scratch. The previous code kept
        # one list for all rounds, so from round 2 on it appended to the
        # very list being iterated (endless loop on a permanent failure).
        stillFailing = []
        for it in pending:
            L.info("retry[{}] {} -> {}", roundNo, it[0][0], it[1][0])
            startDate = datetime.datetime.today() + datetime.timedelta(
                days=1)
            if ER_SUCC != self.crawlOneAirline(it[0], it[1], startDate):
                stillFailing.append(it)
        pending = stillFailing

    if not pending:
        # The last round cleared the remaining failures.
        return ER_SUCC

    for it in pending:
        L.error("{} -> {} retry {} times failed", it[0][0], it[1][0],
                retryTime)
def analysis(fileName):
    """Extract flight records from a saved search-result HTML file.

    Rows are the children of the element with class ``e_fly_lst`` whose
    ``class`` attribute is exactly ``avt-column``.

    Returns:
        ``None`` when the ``e_fly_lst`` container is not found; otherwise a
        list of tuples ``(flightNo, depTime, depAirport, arrTime,
        arrAirport, elapsedTime, ptyRate, delayTime, ticketPrice)``.
        Rows that cannot be parsed are logged and skipped.
    """
    with open(fileName) as f:
        s = BeautifulSoup(f, 'html.parser')

    sub = s.find(class_='e_fly_lst')
    if not sub:
        return None

    retList = []
    for airline in sub.children:
        # Text nodes between the rows have no attributes; skip them
        # instead of failing on ".attrs".
        attrs = getattr(airline, 'attrs', None)
        if not attrs or 'class' not in attrs:
            continue
        classes = airline['class']
        if len(classes) != 1 or classes[0] != u'avt-column':
            continue

        print(airline)
        try:
            # The id / contents lookups are inside the try so that one
            # malformed row is skipped rather than aborting the whole file.
            L.debug("{}", airline['id'])
            info = airline.contents[0].contents
            L.debug("{}", info)
            flightNo = getFlightNumber(info[1])
            depTime, depAirport = getDepartureTimeAirport(info[2])
            elapsedTime = getElapsedTime(info[3])
            arrTime, arrAirport = getArriveTimeAirport(info[4])
            ptyRate, delayTime = getPunctualityRateDelayTime(info[5])
            ticketPrice = getTicketPrice(info[6])
        except Exception:
            L.warning("Get info from airline failed. content = {}", airline)
            continue

        retList.append((flightNo, depTime, depAirport, arrTime, arrAirport,
                        elapsedTime, ptyRate, delayTime, ticketPrice))
    return retList
def analysis(fileName):
    """Parse a saved search-result page into a list of flight tuples.

    Only children of the ``e_fly_lst`` element whose ``class`` attribute is
    the single value ``avt-column`` are treated as flight rows.

    Returns:
        ``None`` if the page has no ``e_fly_lst`` element, else a list of
        ``(flightNo, depTime, depAirport, arrTime, arrAirport, elapsedTime,
        ptyRate, delayTime, ticketPrice)`` tuples. A row that fails to
        parse is logged with a warning and left out.
    """
    with open(fileName) as f:
        s = BeautifulSoup(f, 'html.parser')

    sub = s.find(class_='e_fly_lst')
    if not sub:
        return None

    retList = []
    for airline in sub.children:
        # Children can be plain text nodes, which carry no ".attrs";
        # treat them like any other non-matching child.
        nodeAttrs = getattr(airline, 'attrs', None) or {}
        isFlightRow = ('class' in nodeAttrs
                       and len(airline['class']) == 1
                       and airline['class'][0] == u'avt-column')
        if not isFlightRow:
            continue

        print(airline)
        try:
            # Everything that indexes into the row lives inside the try:
            # a row without an id or without nested content is skipped
            # with a warning instead of raising out of the function.
            L.debug("{}", airline['id'])
            info = airline.contents[0].contents
            L.debug("{}", info)
            flightNo = getFlightNumber(info[1])
            depTime, depAirport = getDepartureTimeAirport(info[2])
            elapsedTime = getElapsedTime(info[3])
            arrTime, arrAirport = getArriveTimeAirport(info[4])
            ptyRate, delayTime = getPunctualityRateDelayTime(info[5])
            ticketPrice = getTicketPrice(info[6])
        except Exception:
            L.warning("Get info from airline failed. content = {}",
                      airline)
            continue

        rec = (flightNo, depTime, depAirport, arrTime, arrAirport,
               elapsedTime, ptyRate, delayTime, ticketPrice)
        retList.append(rec)
    return retList
def processDataByFile(fileName, depDate, depCode, arrCode):
    """Parse a saved result page and insert every flight found into the DB.

    Each record returned by ``qunar.analysis(fileName)`` is wrapped in a
    ``FlightInfo`` stamped with the current local date and time and passed
    to ``FlightInfoHandler.insertOneRec``.

    Returns:
        ``ER_CONN_DB_FAILED`` when no DB connection is available, otherwise
        ``ER_SUCC`` (also when the page yielded no flights; that case is
        only logged). Results of the individual inserts are not checked.
    """
    curDateTime = time.localtime(time.time())
    queryDate = time.strftime('%Y-%m-%d', curDateTime)
    queryTime = time.strftime('%H:%M:%S', curDateTime)

    retList = qunar.analysis(fileName)
    if not retList:
        L.error("Not found airline")
        # analysis() may return None; iterate an empty list instead so this
        # case behaves like "no flights" rather than raising TypeError.
        retList = []

    conn = g_db.getConn()
    if not conn:
        L.error("connect db failed.")
        return ER_CONN_DB_FAILED

    handler = FlightInfoHandler(conn)
    for rec in retList:
        flightInfo = FlightInfo(queryDate, queryTime, depDate, depCode, arrCode, rec)
        handler.insertOneRec(flightInfo)
    return ER_SUCC
def crawlOneAirline(self, depInfo, arrInfo, startDate):
    """Fetch the fare forecast for one route and store every lowest price.

    Args:
        depInfo: indexable; element 0 is used in logs and records, element
            1 as the ``departureCity`` URL parameter.
        arrInfo: same layout as ``depInfo``, for the arrival side.
        startDate: formatted into the ``departureDate`` URL parameter.

    Returns:
        ``ER_REQUEST_TIMEOUT`` if the HTTP request raised,
        ``ER_RESPONSE_FAIL`` if the status is not 200 or the response has
        no ``ResultData`` element, otherwise ``ER_SUCC``. Results of the
        individual inserts are not checked.
    """
    curDateTime = time.localtime(time.time())
    queryDate = time.strftime('%Y-%m-%d', curDateTime)
    queryTime = time.strftime('%H:%M:%S', curDateTime)

    urlBase = "http://flight.qunar.com/twell/flight/farecast.jsp?departureCity={}&arrivalCity={}&nextNDays=0&departureDate={}&searchType=OnewayFlight&searchLangs=zh&locale=zh&serverIP=twell4&allowOld=true&queryID=127.0.0.1%3A1c1ea29%3A113aed2be0b%3A-7bfb&dayNum={}&pageNum=0"
    url = urlBase.format(depInfo[1], arrInfo[1], startDate, self.dateRange)

    try:
        r = requests.get(url, timeout=10)
    except Exception as e:
        # Every request failure is reported with the timeout code.
        L.error(e)
        L.error("{} -> {} timeout, url={}", depInfo[0], arrInfo[0], url)
        return ER_REQUEST_TIMEOUT

    if r.status_code != 200:
        L.error("{} -> {} failed, url={}", depInfo[0], arrInfo[0], url)
        return ER_RESPONSE_FAIL

    L.info("{} -> {}", depInfo[0], arrInfo[0])
    bs = BeautifulSoup(r.text, 'lxml-xml')
    resultData = bs.find('ResultData')
    if resultData is None:
        # A 200 response without the expected element used to crash on
        # ".children"; report it as a failed response so callers can retry.
        L.error("{} -> {} no ResultData in response, url={}", depInfo[0], arrInfo[0], url)
        return ER_RESPONSE_FAIL

    allAttrs = ('date', 'code', 'depTime', 'arrTime', 'carrier', 'vendorName', 'price')
    for airline in resultData.children:
        if getattr(airline, 'name', None) != 'lowestPrice':
            continue
        d = airline.attrs
        # Skip entries that lack any of the required attributes.
        if not all(attr in d for attr in allAttrs):
            continue
        info = FlightLowestPriceInfo(queryDate, queryTime, depInfo[0], arrInfo[0],
                                     [d[x] for x in allAttrs])
        self.dbHandle.insertOneRec(info)
    return ER_SUCC