Ejemplo n.º 1
0
    def insertOneRec(self, info):
        """Insert a single record and commit it.

        Executes ``self.INSERT_SQL`` on a fresh cursor of ``self.conn`` with
        the parameters returned by ``info.asRec()``.  When the insert or the
        commit fails, the error is logged and the transaction is rolled back;
        the exception is not propagated to the caller.

        :param info: object exposing ``asRec()``; its result is passed to
            ``cursor.execute`` as the SQL parameters.
        :return: None, on success and on failure alike.
        """
        cursor = self.conn.cursor()
        try:
            cursor.execute(self.INSERT_SQL, info.asRec())
            self.conn.commit()
        except Exception as e:
            # ``Exception`` instead of a bare ``except`` so KeyboardInterrupt
            # and SystemExit are not silently swallowed here.
            L.error(e)
            L.error("insert failed")
            self.conn.rollback()
        finally:
            # Release the cursor even when the rollback itself raises.
            cursor.close()
Ejemplo n.º 2
0
 def realFunc(*args, **kwargs):
     """Call ``func`` with a ``timeout``-second alarm armed around it.

     ``func`` and ``timeout`` are free variables taken from the enclosing
     scope, which is not visible in this block.

     :return: whatever ``func`` returns, or ``ER_TIMEOUT`` when ``func``
         raises any ``Exception`` (presumably including the one raised by
         the SIGALRM handler; the handler is installed elsewhere).
     """
     signal.alarm(timeout)
     try:
         ret = func(*args, **kwargs)
     except Exception:
         L.error("execute timeout")
         return ER_TIMEOUT
     finally:
         # Always disarm the alarm.  Previously it stayed pending when func
         # failed for a reason other than the timeout and could fire later
         # in unrelated code.
         signal.alarm(0)

     return ret
Ejemplo n.º 3
0
 def insertOneRec(self, info):
     """Insert one record built from ``info`` and commit the transaction.

     ``self.INSERT_SQL`` is executed with ``info.asRec()`` as parameters on
     a new cursor obtained from ``self.conn``.  A failure is logged and
     rolled back, never raised to the caller.

     :param info: object providing ``asRec()`` (the SQL parameters).
     :return: None in every case.
     """
     cursor = self.conn.cursor()
     try:
         cursor.execute(self.INSERT_SQL, info.asRec())
         self.conn.commit()
     except Exception as e:
         # Narrower than the former bare ``except``: interpreter-exit
         # signals (KeyboardInterrupt, SystemExit) now propagate.
         L.error(e)
         L.error("insert failed")
         self.conn.rollback()
     finally:
         # Guarantees the cursor is closed even if rollback() fails.
         cursor.close()
Ejemplo n.º 4
0
        def realFunc(*args, **kwargs):
            """Run ``func`` guarded by a ``timeout``-second alarm.

            ``func`` and ``timeout`` come from the enclosing scope (outside
            this block).

            :return: the result of ``func``; ``ER_TIMEOUT`` if ``func``
                raises any ``Exception`` (the exception is logged first).
            """
            signal.alarm(timeout)
            try:
                ret = func(*args, **kwargs)
            except Exception as e:
                L.error(e)
                L.error("execute timeout")
                return ER_TIMEOUT
            finally:
                # Cancel the alarm on every exit path; before, an exception
                # from func left the alarm armed and it could go off later.
                signal.alarm(0)

            return ret
Ejemplo n.º 5
0
def getRecordNum(cursor, QUERY_SQL):
    """Run a query and return the first column of its first row.

    :param cursor: cursor object offering ``execute``, ``fetchone`` and
        ``close``; it is closed before this function returns or raises.
    :param QUERY_SQL: indexable pair; item 0 is the SQL text and item 1 the
        parameters, both handed to ``cursor.execute``.
    :return: ``cursor.fetchone()[0]``.
    :raises Exception: any error from executing or fetching is logged and
        re-raised unchanged.
    """
    try:
        cursor.execute(QUERY_SQL[0], QUERY_SQL[1])
        recNum = cursor.fetchone()[0]
    except Exception:
        L.error("query record number failed")
        # A bare ``raise`` keeps the original traceback; ``raise e`` would
        # restart it at this line on Python 2.
        raise
    finally:
        cursor.close()

    return recNum
Ejemplo n.º 6
0
def getRecordNum(cursor, QUERY_SQL):
    """Execute ``QUERY_SQL`` and return the first field of the first row.

    :param cursor: object with ``execute``/``fetchone``/``close``; always
        closed on exit, whether the query succeeded or not.
    :param QUERY_SQL: ``QUERY_SQL[0]`` is the statement and ``QUERY_SQL[1]``
        its parameters.
    :return: first element of the row returned by ``cursor.fetchone()``.
    :raises Exception: failures are logged and then propagated as-is.
    """
    try:
        cursor.execute(QUERY_SQL[0], QUERY_SQL[1])
        return cursor.fetchone()[0]
    except Exception:
        L.error("query record number failed")
        # Re-raise with the original traceback intact (``raise e`` loses it
        # on Python 2).
        raise
    finally:
        cursor.close()
Ejemplo n.º 7
0
def statis4FlightLowestPriceInfo(duration):
    """Print the FlightLowestPriceInfo record count for each recent day.

    For every day from ``duration`` days ago up to today (oldest first) the
    count returned by ``FlightLowestPriceInfoHandler.getRecordNum`` for that
    query date is printed as ``YYYY-MM-DD : count``.

    :param duration: number of days to look back; today is always included.
    :return: ``ER_CONN_DB_FAILED`` when no connection is available,
        otherwise None.
    """
    conn = g_db.getConn()
    if not conn:
        L.error("connect db failed.")
        return ER_CONN_DB_FAILED

    handler = FlightLowestPriceInfoHandler(conn)
    # Single-argument print(...) yields the same output whether ``print`` is
    # the Python 2 statement or the Python 3 function.
    print("FlightLowestPriceInfo:")
    # Read the clock once so all dates are relative to the same "today",
    # even if the loop runs across midnight.
    today = datetime.datetime.today()
    for daysAgo in range(duration, -1, -1):
        queryDate = (today - datetime.timedelta(days=daysAgo)).strftime("%Y-%m-%d")
        recNum = handler.getRecordNum(query_date=queryDate)
        print("{} : {}".format(queryDate, recNum))
Ejemplo n.º 8
0
def searchOne(casperScript, dep, arr, depDate):
    """Run the casperjs search for one route/date and process its output.

    The command line is built from ``casperScript``, the first two items of
    ``dep`` and ``arr``, and ``depDate``, then run through
    ``commands.getstatusoutput``.

    :return: the command's exit status when it differs from ``ER_SUCC``;
        otherwise the result of ``processDataByFile`` on
        ``/tmp/searchResult.html`` (presumably written by the script --
        confirm against the casperjs script).
    """
    cmdArgs = (casperScript, dep[0], dep[1], arr[0], arr[1], depDate)
    cmd = "casperjs '{}' '{}' '{}' '{}' '{}' '{}'".format(*cmdArgs)
    status, output = commands.getstatusoutput(cmd)

    if status == ER_SUCC:
        L.debug("Execute command[{}] succeed", cmd)
        return processDataByFile("/tmp/searchResult.html", depDate, dep[0], arr[0])

    L.error("Execute command[{}] failed, errCode: {}, errMsg:{}", cmd, status, output)
    return status
Ejemplo n.º 9
0
    def insertOneRec(self, flightInfo):
        """Insert one flight record and commit it.

        Executes ``self.INSERT_SQL`` with ``flightInfo.asRec()`` as the
        parameters on a new cursor of ``self.conn``.

        :param flightInfo: object exposing ``asRec()``.
        :return: ``ER_SUCC`` after a successful commit; ``ER_INSERT_FAILED``
            when the insert or commit raised (the transaction is rolled
            back and the error logged).
        """
        cursor = self.conn.cursor()
        try:
            cursor.execute(self.INSERT_SQL, flightInfo.asRec())
            self.conn.commit()
        except Exception as e:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still propagate.
            L.error(e)
            L.error("insert failed")
            self.conn.rollback()
            return ER_INSERT_FAILED
        finally:
            # Previously the failure path returned before close() and
            # leaked the cursor; ``finally`` covers both paths.
            cursor.close()

        return ER_SUCC
Ejemplo n.º 10
0
def statis4FlightLowestPriceInfo(duration):
    """Print per-day FlightLowestPriceInfo record counts.

    Walks from ``duration`` days ago to today (inclusive, oldest first) and
    prints ``YYYY-MM-DD : count`` using the count that
    ``FlightLowestPriceInfoHandler.getRecordNum`` returns for that date.

    :param duration: how many days back to start from.
    :return: ``ER_CONN_DB_FAILED`` if ``g_db.getConn()`` yields nothing
        usable; None otherwise.
    """
    conn = g_db.getConn()
    if not conn:
        L.error("connect db failed.")
        return ER_CONN_DB_FAILED

    handler = FlightLowestPriceInfoHandler(conn)
    # print(<one argument>) behaves identically as a Python 2 statement and
    # as a Python 3 function call.
    print("FlightLowestPriceInfo:")
    # Hoisted out of the loop: one clock reading keeps every computed date
    # consistent even when the run straddles midnight.
    today = datetime.datetime.today()
    for i in range(duration, -1, -1):
        queryDay = today - datetime.timedelta(days=i)
        queryDate = queryDay.strftime("%Y-%m-%d")
        recNum = handler.getRecordNum(query_date=queryDate)
        print("{} : {}".format(queryDate, recNum))
Ejemplo n.º 11
0
 def insertOneRec(self, flightInfo):
     """Insert a flight record, committing on success.

     ``self.INSERT_SQL`` runs with the parameters from
     ``flightInfo.asRec()`` on a cursor created from ``self.conn``.

     :param flightInfo: object providing ``asRec()``.
     :return: ``ER_SUCC`` when the row was committed, ``ER_INSERT_FAILED``
         when execution or commit raised (logged and rolled back).
     """
     cursor = self.conn.cursor()
     try:
         cursor.execute(self.INSERT_SQL, flightInfo.asRec())
         self.conn.commit()
     except Exception as e:
         # Narrowed from a bare ``except`` so interpreter-exit exceptions
         # are not converted into an error code.
         L.error(e)
         L.error("insert failed")
         self.conn.rollback()
         return ER_INSERT_FAILED
     finally:
         # Close on both paths; the failure path used to skip close().
         cursor.close()

     return ER_SUCC
Ejemplo n.º 12
0
def searchRange(casperScript, dep, arr, dateRange, retryTimes):
    """Search one route for each of the next ``dateRange`` days.

    Starting with tomorrow, ``searchOne`` is called for each departure date
    up to ``retryTimes`` times, stopping at the first ``ER_SUCC``.  A date
    whose last attempt did not return ``ER_SUCC`` is logged as an error.

    :return: None.
    """
    today = datetime.datetime.today()
    for offset in range(1, dateRange + 1):
        depDate = (today + datetime.timedelta(days=offset)).strftime("%Y-%m-%d")

        status = ER_SUCC
        for _ in range(retryTimes):
            status = searchOne(casperScript, dep, arr, depDate)
            if status == ER_SUCC:
                L.info("{} -> {}  {}", dep[0], arr[0], depDate)
                break

        if status != ER_SUCC:
            L.error("retry {} times, {} -> {}  {} failed", retryTimes, dep[0], arr[0], depDate)
Ejemplo n.º 13
0
def searchOne(casperScript, dep, arr, depDate):
    """Invoke casperjs for a single route and date, then process the result.

    :param casperScript: script path passed to ``casperjs``.
    :param dep: indexable; items 0 and 1 go on the command line and item 0
        is also forwarded to ``processDataByFile``.
    :param arr: indexable, used the same way as ``dep``.
    :param depDate: departure date, passed through as given.
    :return: the non-``ER_SUCC`` exit status of the command, or else the
        return value of ``processDataByFile``.
    """
    template = "casperjs '{}' '{}' '{}' '{}' '{}' '{}'"
    cmd = template.format(casperScript, dep[0], dep[1], arr[0], arr[1], depDate)
    exitCode, cmdOutput = commands.getstatusoutput(cmd)

    failed = exitCode != ER_SUCC
    if failed:
        L.error("Execute command[{}] failed, errCode: {}, errMsg:{}", cmd,
                exitCode, cmdOutput)
        return exitCode

    L.debug("Execute command[{}] succeed", cmd)
    return processDataByFile("/tmp/searchResult.html", depDate, dep[0], arr[0])
Ejemplo n.º 14
0
def searchRange(casperScript, dep, arr, dateRange, retryTimes):
    """Run ``searchOne`` for every departure date in the coming days.

    Dates run from tomorrow through ``dateRange`` days ahead.  Each date is
    tried at most ``retryTimes`` times; the first ``ER_SUCC`` ends the
    retries and is logged, while a date that never succeeds is logged as an
    error.

    :return: None.
    """
    oneDay = datetime.timedelta(days=1)
    depTime = datetime.datetime.today()
    remainingDays = dateRange
    while remainingDays > 0:
        remainingDays -= 1
        depTime += oneDay
        depDate = depTime.strftime("%Y-%m-%d")

        lastStatus = ER_SUCC
        attempt = 0
        while attempt < retryTimes:
            attempt += 1
            lastStatus = searchOne(casperScript, dep, arr, depDate)
            if lastStatus == ER_SUCC:
                L.info("{} -> {}  {}", dep[0], arr[0], depDate)
                break

        if lastStatus != ER_SUCC:
            L.error("retry {} times, {} -> {}  {} failed", retryTimes, dep[0],
                    arr[0], depDate)
Ejemplo n.º 15
0
 def crawlAllAirlinesWithRetry(self, retryTime):
     """Crawl every airline, then retry the failed ones up to ``retryTime`` rounds.

     After ``self.crawlAllAirlines()``, the entries of
     ``self.lstFailAirline`` are retried with ``self.crawlOneAirline``
     using tomorrow as the start date.  Each round only retries the entries
     that failed in the previous round.

     :param retryTime: maximum number of retry rounds.
     :return: ``ER_SUCC`` once no failed entry remains; otherwise the
         remaining entries are logged as errors and None is returned.
     """
     self.crawlAllAirlines()

     lstCur = self.lstFailAirline
     for i in range(1, retryTime + 1):
         if not lstCur:
             return ER_SUCC

         # A fresh list per round.  The former single shared list was
         # aliased by lstCur after round one, so later rounds appended to
         # the list being iterated (never-ending loop on persistent
         # failures) and never dropped entries that had succeeded.
         lstNext = []
         for it in lstCur:
             L.info("retry[{}] {} -> {}", i, it[0][0], it[1][0])
             startDate = datetime.datetime.today() + datetime.timedelta(days=1)
             if ER_SUCC != self.crawlOneAirline(it[0], it[1], startDate):
                 lstNext.append(it)

         lstCur = lstNext

     # The last round may have cleared every failure.
     if not lstCur:
         return ER_SUCC

     for it in lstCur:
         L.error("{} -> {} retry {} times failed", it[0][0], it[1][0], retryTime)
Ejemplo n.º 16
0
    def crawlAllAirlinesWithRetry(self, retryTime):
        """Crawl all airlines and retry failures for at most ``retryTime`` rounds.

        ``self.crawlAllAirlines()`` runs first; the entries it left in
        ``self.lstFailAirline`` are then re-crawled one by one through
        ``self.crawlOneAirline`` with tomorrow as the start date.  Only the
        entries that failed in one round are carried into the next.

        :param retryTime: upper bound on retry rounds.
        :return: ``ER_SUCC`` when nothing is left failing; otherwise each
            remaining entry is logged as an error and None is returned.
        """
        self.crawlAllAirlines()

        lstCur = self.lstFailAirline
        for i in range(1, retryTime + 1):
            if not lstCur:
                return ER_SUCC

            # Rebuilt every round: with one shared list, ``lstCur`` and
            # ``lstNext`` became the same object after the first round, so
            # the loop appended to the list it was iterating and kept
            # already-recovered entries.
            lstNext = []
            for it in lstCur:
                L.info("retry[{}] {} -> {}", i, it[0][0], it[1][0])
                startDate = datetime.datetime.today() + datetime.timedelta(
                    days=1)
                if ER_SUCC != self.crawlOneAirline(it[0], it[1], startDate):
                    lstNext.append(it)

            lstCur = lstNext

        # Success on the final round must also be reported as success.
        if not lstCur:
            return ER_SUCC

        for it in lstCur:
            L.error("{} -> {} retry {} times failed", it[0][0], it[1][0],
                    retryTime)
Ejemplo n.º 17
0
def analysis(fileName):
    """Extract flight records from a saved search-result HTML file.

    The file is parsed with BeautifulSoup (``html.parser``).  Every direct
    child of the first element with class ``e_fly_lst`` whose ``class``
    attribute is exactly ``avt-column`` is decoded through the ``get*``
    helper functions; children that cannot be decoded are logged as a
    warning and skipped.

    :param fileName: path of the HTML file to read.
    :return: list of tuples ``(flightNo, depTime, depAirport, arrTime,
        arrAirport, elapsedTime, ptyRate, delayTime, ticketPrice)``, or
        None when no ``e_fly_lst`` element is found.
    """
    retList = []
    with open(fileName) as f:
        s = BeautifulSoup(f, 'html.parser')
        sub = s.find(class_='e_fly_lst')
        if not sub:
            return None
        for airline in sub.children:
            # ``.children`` also yields text nodes, which have no ``attrs``;
            # skip them instead of failing with AttributeError.
            attrs = getattr(airline, 'attrs', None)
            classes = attrs.get('class') if attrs else None
            if not classes or len(classes) != 1 or classes[0] != u'avt-column':
                continue

            # Single-argument print(...) outputs the same under Python 2
            # and Python 3.
            print(airline)
            L.debug("{}", airline['id'])
            info = airline.contents[0].contents
            L.debug("{}", info)
            try:
                flightNo = getFlightNumber(info[1])
                depTime, depAirport = getDepartureTimeAirport(info[2])
                elapsedTime = getElapsedTime(info[3])
                arrTime, arrAirport = getArriveTimeAirport(info[4])
                ptyRate, delayTime = getPunctualityRateDelayTime(info[5])
                ticketPrice = getTicketPrice(info[6])
            except Exception:
                # ``Exception`` rather than a bare ``except`` so Ctrl-C and
                # SystemExit are not swallowed while parsing.
                L.warning("Get info from airline failed. content = {}", airline)
                continue

            rec = (flightNo, depTime, depAirport, arrTime, arrAirport, elapsedTime, ptyRate, delayTime, ticketPrice)
            retList.append(rec)
    return retList
Ejemplo n.º 18
0
def analysis(fileName):
    """Parse a saved search-result page into a list of flight tuples.

    Looks up the first element with class ``e_fly_lst`` and walks its direct
    children.  A child is treated as a flight row only when its ``class``
    attribute holds the single value ``avt-column``; the row's fields are
    read with the ``get*`` helpers.  Rows that fail to parse are logged with
    a warning and left out.

    :param fileName: path of the HTML file.
    :return: None if the page has no ``e_fly_lst`` element; otherwise a list
        of ``(flightNo, depTime, depAirport, arrTime, arrAirport,
        elapsedTime, ptyRate, delayTime, ticketPrice)`` tuples.
    """
    retList = []
    with open(fileName) as f:
        s = BeautifulSoup(f, 'html.parser')
        sub = s.find(class_='e_fly_lst')
        if not sub:
            return None
        for airline in sub.children:
            # Text nodes between tags carry no ``attrs`` attribute; reading
            # it unguarded raised AttributeError outside the try block.
            attrs = getattr(airline, 'attrs', None)
            if not attrs or 'class' not in attrs:
                continue
            classes = attrs['class']
            if len(classes) != 1 or classes[0] != u'avt-column':
                continue

            # print(<one argument>) is output-compatible between the
            # Python 2 statement and the Python 3 function.
            print(airline)
            L.debug("{}", airline['id'])
            info = airline.contents[0].contents
            L.debug("{}", info)
            try:
                flightNo = getFlightNumber(info[1])
                depTime, depAirport = getDepartureTimeAirport(info[2])
                elapsedTime = getElapsedTime(info[3])
                arrTime, arrAirport = getArriveTimeAirport(info[4])
                ptyRate, delayTime = getPunctualityRateDelayTime(info[5])
                ticketPrice = getTicketPrice(info[6])
            except Exception:
                # Narrowed from a bare ``except`` so KeyboardInterrupt and
                # SystemExit propagate.
                L.warning("Get info from airline failed. content = {}",
                          airline)
                continue

            rec = (flightNo, depTime, depAirport, arrTime, arrAirport,
                   elapsedTime, ptyRate, delayTime, ticketPrice)
            retList.append(rec)
    return retList
Ejemplo n.º 19
0
def processDataByFile(fileName, depDate, depCode, arrCode):
    """Parse a result file and store every flight record it contains.

    The records returned by ``qunar.analysis(fileName)`` are wrapped in
    ``FlightInfo`` objects stamped with the current local date and time and
    inserted through ``FlightInfoHandler.insertOneRec``.

    :param fileName: path handed to ``qunar.analysis``.
    :param depDate: departure date stored with each record.
    :param depCode: departure code stored with each record.
    :param arrCode: arrival code stored with each record.
    :return: ``ER_CONN_DB_FAILED`` when no database connection is available,
        otherwise ``ER_SUCC`` (also when no records were found; the result
        of the individual inserts is not checked).
    """
    curDateTime = time.localtime(time.time())

    queryDate = time.strftime('%Y-%m-%d', curDateTime)
    queryTime = time.strftime('%H:%M:%S', curDateTime)

    retList = qunar.analysis(fileName)
    if not retList:
        L.error("Not found airline")
        # A falsy result may be None rather than an empty list; normalise it
        # so the insert loop below is a no-op instead of a TypeError.
        retList = []

    conn = g_db.getConn()
    if not conn:
        L.error("connect db failed.")
        return ER_CONN_DB_FAILED

    handler = FlightInfoHandler(conn)

    for rec in retList:
        flightInfo = FlightInfo(queryDate, queryTime, depDate, depCode, arrCode, rec)
        handler.insertOneRec(flightInfo)

    return ER_SUCC
Ejemplo n.º 20
0
    def crawlOneAirline(self, depInfo, arrInfo, startDate):
        """Fetch the lowest-price forecast for one route and store it.

        Requests the farecast URL built from ``depInfo[1]``, ``arrInfo[1]``,
        ``startDate`` and ``self.dateRange``, parses the XML response and
        inserts one ``FlightLowestPriceInfo`` per ``lowestPrice`` element
        that carries all required attributes.

        :param depInfo: indexable; item 0 is used for logging and records,
            item 1 in the URL.
        :param arrInfo: indexable, used the same way as ``depInfo``.
        :param startDate: formatted into the URL as given.
            NOTE(review): callers may pass a ``datetime`` -- confirm the
            resulting text is what the service expects.
        :return: ``ER_REQUEST_TIMEOUT`` if the request raised,
            ``ER_RESPONSE_FAIL`` for a non-200 status or a response without
            a ``ResultData`` element, otherwise ``ER_SUCC``.
        """
        curDateTime = time.localtime(time.time())
        queryDate = time.strftime('%Y-%m-%d', curDateTime)
        queryTime = time.strftime('%H:%M:%S', curDateTime)
        urlBase = "http://flight.qunar.com/twell/flight/farecast.jsp?departureCity={}&arrivalCity={}&nextNDays=0&departureDate={}&searchType=OnewayFlight&searchLangs=zh&locale=zh&serverIP=twell4&allowOld=true&queryID=127.0.0.1%3A1c1ea29%3A113aed2be0b%3A-7bfb&dayNum={}&pageNum=0"
        url = urlBase.format(depInfo[1], arrInfo[1], startDate, self.dateRange)
        try:
            r = requests.get(url, timeout=10)
        except Exception as e:
            L.error(e)
            L.error("{} -> {} timeout, url={}", depInfo[0], arrInfo[0], url)
            return ER_REQUEST_TIMEOUT

        if r.status_code != 200:
            L.error("{} -> {} failed, url={}", depInfo[0], arrInfo[0], url)
            return ER_RESPONSE_FAIL

        L.info("{} -> {}", depInfo[0], arrInfo[0])
        bs = BeautifulSoup(r.text, 'lxml-xml')
        resultData = bs.find('ResultData')
        if resultData is None:
            # find() returns None when the element is missing; report a bad
            # response instead of crashing on ``.children``.
            L.error("{} -> {} no ResultData, url={}", depInfo[0], arrInfo[0],
                    url)
            return ER_RESPONSE_FAIL

        allAttrs = ('date', 'code', 'depTime', 'arrTime', 'carrier',
                    'vendorName', 'price')
        for airline in resultData.children:
            if airline.name != 'lowestPrice':
                continue

            d = airline.attrs
            # Skip elements that lack any of the required attributes.
            if not all(attr in d for attr in allAttrs):
                continue

            info = FlightLowestPriceInfo(queryDate, queryTime, depInfo[0],
                                         arrInfo[0],
                                         [d[x] for x in allAttrs])
            self.dbHandle.insertOneRec(info)

        return ER_SUCC
Ejemplo n.º 21
0
    def crawlOneAirline(self, depInfo, arrInfo, startDate):
        """Download and store the lowest-price data for a single route.

        The farecast URL is filled with ``depInfo[1]``, ``arrInfo[1]``,
        ``startDate`` and ``self.dateRange``.  Each ``lowestPrice`` child of
        the response's ``ResultData`` element that has all required
        attributes is stored via ``self.dbHandle.insertOneRec``.

        :param depInfo: indexable; item 0 appears in logs and records,
            item 1 in the URL.
        :param arrInfo: indexable, same layout as ``depInfo``.
        :param startDate: inserted into the URL unchanged.
            NOTE(review): verify the format the service expects when a
            ``datetime`` is passed.
        :return: ``ER_REQUEST_TIMEOUT`` when the HTTP request raised,
            ``ER_RESPONSE_FAIL`` on a non-200 status or a missing
            ``ResultData`` element, ``ER_SUCC`` otherwise.
        """
        curDateTime = time.localtime(time.time())
        queryDate = time.strftime('%Y-%m-%d', curDateTime)
        queryTime = time.strftime('%H:%M:%S', curDateTime)
        urlBase = "http://flight.qunar.com/twell/flight/farecast.jsp?departureCity={}&arrivalCity={}&nextNDays=0&departureDate={}&searchType=OnewayFlight&searchLangs=zh&locale=zh&serverIP=twell4&allowOld=true&queryID=127.0.0.1%3A1c1ea29%3A113aed2be0b%3A-7bfb&dayNum={}&pageNum=0"
        url = urlBase.format(depInfo[1], arrInfo[1], startDate, self.dateRange)
        try:
            r = requests.get(url, timeout=10)
        except Exception as e:
            L.error(e)
            L.error("{} -> {} timeout, url={}", depInfo[0], arrInfo[0], url)
            return ER_REQUEST_TIMEOUT

        if r.status_code != 200:
            L.error("{} -> {} failed, url={}", depInfo[0], arrInfo[0], url)
            return ER_RESPONSE_FAIL

        L.info("{} -> {}", depInfo[0], arrInfo[0])
        bs = BeautifulSoup(r.text, 'lxml-xml')
        resultData = bs.find('ResultData')
        if resultData is None:
            # Without this guard a response lacking ResultData raised
            # AttributeError on ``.children`` and aborted the whole crawl.
            L.error("{} -> {} no ResultData, url={}", depInfo[0], arrInfo[0], url)
            return ER_RESPONSE_FAIL

        # Loop-invariant, so defined once outside the loop.
        allAttrs = ('date', 'code', 'depTime', 'arrTime', 'carrier', 'vendorName', 'price')
        for airline in resultData.children:
            if airline.name == 'lowestPrice':
                d = airline.attrs
                if any(attr not in d for attr in allAttrs):
                    # Incomplete element: nothing to store.
                    continue

                info = FlightLowestPriceInfo(queryDate, queryTime, depInfo[0], arrInfo[0], [d[x] for x in allAttrs])
                self.dbHandle.insertOneRec(info)

        return ER_SUCC