Example 1
    def run(self):
        totalTestCount = 0
        totalWaitingTime = 0
        successCount = 0
        failCount = 0

        # Only the HTTP protocol is supported for now.
        if self.proxy.protocol != 'http':
            return

        for i in range(0, self.totalTestNumber):
            success, waitingTime = self.makeRequest()
            if success:
                successCount += 1
                totalWaitingTime += waitingTime
            else:
                failCount += 1
                # Add a large penalty to the total for a failed request
                totalWaitingTime += 10000
            totalTestCount += 1
            if failCount >= self.acceptableFailTestNumber:
                break
            time.sleep(1)

        if successCount >= 1:
            averageTime = float(totalWaitingTime) / successCount
        else:
            # No successful request at all: fall back to a large sentinel
            averageTime = 9999999
        self.proxy.averageTime = averageTime
        self.proxy.testCount = totalTestCount
        self.proxy.successCount = successCount

        log.debug('Finish single proxy test:  ' +
            str(self.proxy.averageTime) + '    ' +
            str(self.proxy.successCount) + '/' + str(self.proxy.testCount) +
            '    ' + self.proxy.getAllString())
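These run() methods look like bodies of threading.Thread subclasses (the warning in Example 2 below mentions a "Crawling thread"). A minimal sketch of how such a proxy-tester thread might be wrapped and started; ProxyTestThread, its constructor arguments, and the proxies list are assumptions, not part of the original code.

import threading

class ProxyTestThread(threading.Thread):
    # Hypothetical wrapper around the run() body shown in Example 1.
    def __init__(self, proxy, url, totalTestNumber=3, acceptableFailTestNumber=2):
        threading.Thread.__init__(self)
        self.proxy = proxy
        self.url = url
        self.totalTestNumber = totalTestNumber
        self.acceptableFailTestNumber = acceptableFailTestNumber

    def run(self):
        pass  # the body shown in Example 1 would go here

proxies = []  # would come from the crawler threads (Example 2)
threads = [ProxyTestThread(p, 'http://example.com/') for p in proxies]
for t in threads:
    t.start()
for t in threads:
    t.join()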
Example 2
 def run(self):
     try:
         opener = urllib2.build_opener()
         html = opener.open(self.url, timeout=10)
         proxies = self.parser.parse(html, self.url)
         for proxy in proxies:
             self.importer.addProxy(proxy)
         log.debug('Crawl proxies from ' + str(self.url) + ':')
         for proxy in proxies:
             log.debug('>>>>>' + proxy.getAllString())
     except Exception as e:
         log.warning('Crawling thread exception: ' + str(e))
Example 3
 def makeRequest(self):
     success = False
     startTime = time.time()
     try:
         protocol = self.proxy.protocol
         if not protocol:
             # Default to HTTP protocol
             protocol = 'http'
         proxy_handler = urllib2.ProxyHandler({
             protocol.lower(): self.proxy.getProxyString()
         })
         opener = urllib2.build_opener(proxy_handler)
         response = opener.open(self.url, timeout=5)
         response.read()
         success = True
     except Exception as e:
         log.debug('Fail on proxy test:  ' + str(e) + ' ' +
             self.proxy.getAllString())
     # Assumed completion: Example 1 expects (success, waitingTime) back,
     # so return the flag together with the elapsed time.
     waitingTime = time.time() - startTime
     return success, waitingTime
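The examples target Python 2 (urllib2 and the old `except ..., e` syntax). For reference only, a rough Python 3 equivalent of the timed, proxied request above using urllib.request; the proxy and test URLs are placeholders.

import time
import urllib.request

def make_request(proxy_url, test_url, timeout=5):
    # Return (success, elapsed_seconds) for one request routed through the proxy.
    start = time.time()
    success = False
    try:
        handler = urllib.request.ProxyHandler({'http': proxy_url})
        opener = urllib.request.build_opener(handler)
        response = opener.open(test_url, timeout=timeout)
        response.read()
        success = True
    except Exception as e:
        print('Fail on proxy test:', e)
    return success, time.time() - start

# Placeholder call:
# make_request('http://127.0.0.1:8080', 'http://example.com/')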
Example 4
 def parseProfileHtml(self, html):
     """Returns: {(UserInfo, ErrorCode)} the user information and errorcode.
     """
     document = BeautifulSoup(html)
     errorCode = ErrorCode.LACK_CRITICAL_INFO
     pagetype = -1
     info = None
     if document.find('div', id='timeline'):
         # Timeline profile page
         log.debug('The page type is: Timeline')
         pagetype = PageType.TIME_LINE
         info = self.parseTimelineProfilePage(document)
     else:
         # Old profile page
         if document.find('div', id='visitors'):
             # With access permission
             log.debug('The page type is: old with access')
             pagetype = PageType.OLD_WITH_ACCESS
             info = self.parseOldProfilePageWithAccess(document)
         elif document.find('div', id='allFrdGallery'):
             # Without access permission
             log.debug('The page type is: old without access')
             pagetype = PageType.OLD_WITHOUT_ACCESS
             info = self.parseOldProfilePageWithoutAccess(document)
         else:
             # Unknown page template
             errorCode = ErrorCode.UNKNOWN_PAGE
     if info:
         info.pagetype = pagetype
         info.html = html
         return (info, ErrorCode.OK)
     else:
         # Parse error
         return (None, errorCode)
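The dispatch in Example 4 only depends on which marker <div> id appears in the page. A small self-contained check of that detection logic with BeautifulSoup; the toy HTML snippets are invented, the ids come from the example above.

from bs4 import BeautifulSoup

samples = {
    'timeline': '<div id="timeline"></div>',
    'old with access': '<div id="visitors"></div>',
    'old without access': '<div id="allFrdGallery"></div>',
    'unknown': '<div id="somethingElse"></div>',
}

for name, html in samples.items():
    document = BeautifulSoup(html, 'html.parser')
    if document.find('div', id='timeline'):
        kind = 'TIME_LINE'
    elif document.find('div', id='visitors'):
        kind = 'OLD_WITH_ACCESS'
    elif document.find('div', id='allFrdGallery'):
        kind = 'OLD_WITHOUT_ACCESS'
    else:
        kind = 'UNKNOWN_PAGE'
    print(name, '->', kind)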
Example 5
 def parseOldProfilePageWithAccess(self, document):
     """Returns: {UserInfo} the user information."""
     info = UserInfo()
     # Name & visitedNum
     try:
         holderNode = document.find('div', class_='status-holder')
         info.name = holderNode.find('h1', class_='username').string
         info.visitedNum = int(
             holderNode.find('span', class_='count')
             .find('span', class_='count').string)
     except:
         return None  # Lack of critical information
     summaryNode = document.find('div', class_='profile-summary')
     # Gender
     try:
         # sayHiText is something like: '向他打招呼' ("say hi to him")
         sayHiText = summaryNode.ul.find('a', stats='pf_poke').string
         if re.compile(u'.*他.*').match(sayHiText):
             # '他' ("him") implies a male profile
             info.gender = 'male'
         elif re.compile(u'.*她.*').match(sayHiText):
             # '她' ("her") implies a female profile
             info.gender = 'female'
     except Exception as e:
         log.debug('Exception in gender: ' + str(e))
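The gender check above relies on whether the "say hi" link text contains 他 ("him") or 她 ("her"). A standalone illustration of that matching; the sample strings are invented but mirror the comment in the code.

# -*- coding: utf-8 -*-
import re

def guess_gender(say_hi_text):
    # Same checks as Example 5: 他 = "him", 她 = "her".
    if re.compile(u'.*他.*').match(say_hi_text):
        return 'male'
    if re.compile(u'.*她.*').match(say_hi_text):
        return 'female'
    return 'unknown'

print(guess_gender(u'向他打招呼'))  # -> male
print(guess_gender(u'向她打招呼'))  # -> female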
Example 6
 except:
     pass
 # Hometown
 try:
     hometownNode = ul.find('li', class_='hometown')
     hometownString = ''
     # TODO: Sometimes they don't use <a> to place the hometown string
     for aTag in hometownNode.find_all('a'):
         hometownString += aTag.string + ' '
     if not hometownNode.find_all('a'):
         # Fall back to the node's plain-text content for the hometown
         pattern = re.compile(r'\S*\s(.*)')
         hometownString = pattern.match(hometownNode.string).group(1)
     info.hometown = hometownString
 except Exception as e:
     log.debug('Exception in hometown: ' + str(e))
 # Residence
 try:
     residenceNode = ul.find('li', class_='address')
     placeString = ''
     # TODO: Sometimes they don't use <a> to place the residence string
     for aTag in residenceNode.find_all('a'):
         placeString += aTag.string + ' '
     if not residenceNode.find_all('a'):
         # Fall back to the node's plain-text content for the residence
         pattern = re.compile(r'\S*\s(.*)')
         placeString = pattern.match(residenceNode.string).group(1)
     info.residence = placeString
 except Exception as e:
     log.debug('Exception in Residence: ' + str(e))
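The fallback pattern r'\S*\s(.*)' skips the first whitespace-free token (the field label) and captures everything after the first space. A short illustration with an invented label/value string; the real node text is presumably Chinese, but the mechanics are the same.

import re

pattern = re.compile(r'\S*\s(.*)')

# Invented sample: first token is a field label, the rest is the value.
print(pattern.match('Hometown Beijing Haidian').group(1))  # -> 'Beijing Haidian'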