def run(self):
    """Probe the proxy several times and store the statistics on self.proxy.

    Calls self.makeRequest() up to self.totalTestNumber times, stopping early
    once self.acceptableFailTestNumber failures have been seen.  Afterwards
    self.proxy.averageTime, self.proxy.testCount and self.proxy.successCount
    are updated and a summary line is logged.

    Returns:
        None.  Proxies whose protocol is not HTTP are skipped untouched.
    """
    # Only HTTP is supported for now.  Normalize the same way makeRequest()
    # does (missing protocol means HTTP, comparison is case-insensitive) so
    # that 'HTTP' or an unset protocol is not silently skipped.
    protocol = (self.proxy.protocol or 'http').lower()
    if protocol != 'http':
        return

    totalTestCount = 0
    totalWaitingTime = 0
    successCount = 0
    failCount = 0

    for attempt in range(self.totalTestNumber):
        if attempt:
            # Pause between consecutive requests only; there is nothing to
            # wait for once the last request has been made.
            time.sleep(1)
        success, waitingTime = self.makeRequest()
        if success:
            successCount += 1
            totalWaitingTime += waitingTime
        else:
            failCount += 1
            # A failed request is charged a fixed penalty.
            totalWaitingTime += 10000
        totalTestCount += 1
        if failCount >= self.acceptableFailTestNumber:
            break

    if successCount >= 1:
        averageTime = float(totalWaitingTime) / successCount
    else:
        # No successful request at all: use a sentinel "worst" value.
        averageTime = 9999999

    self.proxy.averageTime = averageTime
    self.proxy.testCount = totalTestCount
    self.proxy.successCount = successCount
    log.debug('Finish single proxy test: ' +
              str(self.proxy.averageTime) + ' ' +
              str(self.proxy.successCount) + '/' +
              str(self.proxy.testCount) + ' ' +
              self.proxy.getAllString())
def run(self):
    """Download self.url, parse proxies out of it and import each of them.

    The response is passed to self.parser.parse() together with the URL;
    every resulting proxy is handed to self.importer.addProxy() and then
    logged.  Any exception is logged as a warning instead of propagating.
    """
    try:
        opener = urllib2.build_opener()
        html = opener.open(self.url, timeout=10)
        try:
            # Materialize the result while the response is still open: it is
            # walked twice below, which would come up empty the second time
            # if the parser returned a one-shot iterator.
            proxies = list(self.parser.parse(html, self.url))
        finally:
            # Always release the underlying connection.
            html.close()

        for proxy in proxies:
            self.importer.addProxy(proxy)

        log.debug('Crawl proxies from ' + str(self.url) + ':')
        for proxy in proxies:
            log.debug('>>>>>' + proxy.getAllString())
    except Exception as e:
        log.warning('Crawling thread exception: ' + str(e))
def makeRequest(self):
    """Fetch self.url once through self.proxy and time the attempt.

    Returns:
        {(bool, float)} whether the request succeeded, and the elapsed
        wall-clock time of the attempt.
    """
    success = False
    startTime = time.time()
    try:
        # Default to HTTP protocol when the proxy does not declare one.
        protocol = self.proxy.protocol or 'http'
        proxy_handler = urllib2.ProxyHandler({
            protocol.lower(): self.proxy.getProxyString()
        })
        opener = urllib2.build_opener(proxy_handler)
        response = opener.open(self.url, timeout=5)
        try:
            # The body must be fully readable for the proxy to count as good.
            response.read()
        finally:
            # Always release the connection, even if reading fails.
            response.close()
        success = True
    except Exception as e:
        log.debug('Fail on proxy test: ' + str(e) + ' ' +
                  self.proxy.getAllString())

    # run() unpacks this as (success, waitingTime).
    # NOTE(review): the elapsed time is reported in milliseconds so it is
    # commensurate with the 10000 failure penalty run() adds -- confirm unit.
    waitingTime = (time.time() - startTime) * 1000
    return success, waitingTime
def parseProfileHtml(self, html):
    """Detect the profile page template and parse it.

    Returns:
        {(UserInfo, ErrorCode)} the parsed user information paired with
        ErrorCode.OK, or (None, <error code>) when nothing could be parsed.
    """
    document = BeautifulSoup(html)

    def hasDiv(divId):
        # Marker <div> whose presence identifies a page template.
        return document.find('div', id=divId)

    # Reported when a known template is found but parsing yields nothing.
    failure = ErrorCode.LACK_CRITICAL_INFO
    pagetype = -1
    info = None

    if hasDiv('timeline'):
        # Timeline profile page
        log.debug('The page type is: Timeline')
        pagetype = PageType.TIME_LINE
        info = self.parseTimelineProfilePage(document)
    elif hasDiv('visitors'):
        # Old profile page, with access permission
        log.debug('The page type is: old with access')
        pagetype = PageType.OLD_WITH_ACCESS
        info = self.parseOldProfilePageWithAccess(document)
    elif hasDiv('allFrdGallery'):
        # Old profile page, without access permission
        log.debug('The page type is: old without access')
        pagetype = PageType.OLD_WITHOUT_ACCESS
        info = self.parseOldProfilePageWithoutAccess(document)
    else:
        # Unknown page template
        failure = ErrorCode.UNKNOWN_PAGE

    if not info:
        # Parse error
        return (None, failure)

    info.pagetype = pagetype
    info.html = html
    return (info, ErrorCode.OK)
def parseOldProfilePageWithAccess(self, document):
    """Parse an old-style profile page the crawler has access to.

    Returns:
        {UserInfo} the user information, or None when the critical fields
        (name and visit count) cannot be extracted.
    """
    info = UserInfo()

    # Name & visitedNum are mandatory: give up on the page without them.
    try:
        holderNode = document.find('div', class_='status-holder')
        info.name = holderNode.find('h1', class_='username').string
        countNode = holderNode.find('span', class_='count')
        info.visitedNum = int(
            countNode.find('span', class_='count').string)
    except Exception:
        # Lack of critical information.  `Exception` rather than a bare
        # except so KeyboardInterrupt/SystemExit still propagate.
        return None

    summaryNode = document.find('div', class_='profile-summary')

    # Gender is optional: a failure here is logged and otherwise ignored.
    try:
        # sayHiText is something like: '向他打招呼'
        sayHiText = summaryNode.ul.find('a', stats='pf_poke').string
        if re.match(u'.*他.*', sayHiText):
            info.gender = 'male'
        elif re.match(u'.*她.*', sayHiText):
            info.gender = 'female'
    except Exception as e:
        log.debug('Exception in gender: ' + str(e))

    # Hand the collected information back, as the docstring promises.
    return info
except: pass # Hometown try: hometownNode = ul.find('li', class_='hometown') hometownString = '' # TODO: Some times they don't user <a> to palce hometown string for aTag in hometownNode.find_all('a'): hometownString += aTag.string + ' ' if not hometownNode.find_all('a'): # Use string to directly represent hometown pattern = re.compile('\S*\s(.*)') hometownString = pattern.match(hometownNode.string).group(1) info.hometown = hometownString except Exception, e: log.debug('Exception in hometown: ' + str(e)) pass # Residence try: residenceNode = ul.find('li', class_='address') placeString = '' # TODO: Some times they don't user <a> to palce hometown string for aTag in residenceNode.find_all('a'): placeString += aTag.string + ' ' if not residenceNode.find_all('a'): # Use string to directly represent hometown pattern = re.compile('\S*\s(.*)') placeString = pattern.match(residenceNode.string).group(1) info.residence = placeString except Exception, e: log.debug('Exception in Residence: ' + str(e))