def __init__(self, threads, smtpSession, smtpLock, feedItem):
    """Set up the ifanr spider: announce it, delegate to BaseSpider, store patterns.

    All four arguments are forwarded unchanged to ``BaseSpider.__init__``.
    """
    print('Init ifanr info.')
    BaseSpider.__init__(self, threads, smtpSession, smtpLock, feedItem)

    # Captures (numeric article id, headline text) from a link to www.ifanr.com.
    self.pattern1 = (
        r'<a rel="external" href="http://www.ifanr.com/(\d+?)" title=".+?">'
        r'.*?<span itemprop="headline">(.+?)</span>.*?</a>'
    )
    # Group 1 is deliberately empty; group 2 is the articleBody content.
    # NOTE(review): presumably the empty group keeps the group count in line
    # with the other spiders' pattern2 -- confirm against BaseSpider.
    self.pattern2 = r'<div itemprop="articleBody">()(.+?)</div>'
    self.urlPrefix = '/'
def __init__(self, threads, smtpSession, smtpLock, feedItem):
    """Set up the cnbeta spider: announce it, delegate to BaseSpider, store patterns.

    All four arguments are forwarded unchanged to ``BaseSpider.__init__``.
    """
    print('Init cnbeta info.')
    BaseSpider.__init__(self, threads, smtpSession, smtpLock, feedItem)

    # Article paths have the form /articles/<id>.htm
    self.urlPrefix = '/articles/'
    self.urlSuffix = '.htm'

    # Captures (numeric article id, link text) from a title block.
    self.pattern1 = (
        r'<div class="title">.+?'
        r'<a target="_blank" href="/articles/(\d+?).htm">(.+?)</a>'
    )
    # Captures (introduction paragraph, content body, terminating tag).
    self.pattern2 = (
        r'<div class="introduction">.+?<p>(.+?)</p>.+?'
        r'<div class="content">(.+?)'
        r'(<iframe class="tigerstock"|<div class="clear")'
    )
def __init__(self, threads, smtpSession, smtpLock, feedItem):
    """Set up the 36kr spider: announce it, delegate to BaseSpider, store patterns.

    All four arguments are forwarded unchanged to ``BaseSpider.__init__``.
    """
    print('Init 36kr info.')
    BaseSpider.__init__(self, threads, smtpSession, smtpLock, feedItem)

    # Article paths have the form /p/<id>.html
    self.urlPrefix = '/p/'
    self.urlSuffix = '.html'

    # Captures (numeric article id, link text) from a news-title anchor.
    self.pattern1 = (
        r'<a class="title info_flow_news_title".+?'
        r'href="/p/(\d+?).html" target="_blank">(.+?)</a>'
    )
    # Captures (post title, article section content).
    self.pattern2 = (
        r'<h1 class="single-post__title">(.+?)</h1>.*?'
        r'<section class="article".+?>(.+?)</section>'
    )
def __init__(self):
    """Run the BaseSpider initialiser, then set ``login_status`` to True."""
    BaseSpider.__init__(self)
    # Assigned after the base initialiser so this value is the one that sticks.
    self.login_status = True
def __init__(self):
    """Run the BaseSpider initialiser, then store the vjudge login action URL."""
    BaseSpider.__init__(self)
    # acm.hust.edu.cn vjudge login endpoint.
    self.login_url = 'http://acm.hust.edu.cn/vjudge/user/login.action'
def __init__(self):
    """Run the BaseSpider initialiser, then store the poj.org login URL."""
    BaseSpider.__init__(self)
    # poj.org login endpoint.
    self.login_url = 'http://poj.org/login'
def __init__(self):
    """Run the BaseSpider initialiser, then store the acm.bnu.edu.cn login URL."""
    BaseSpider.__init__(self)
    # acm.bnu.edu.cn v3 ajax login endpoint.
    self.login_url = 'http://acm.bnu.edu.cn/v3/ajax/login.php'
def __init__(self):
    """Run the BaseSpider initialiser, then store the acm.zju.edu.cn URLs.

    ``login_url`` points at the onlinejudge login action; ``status_url`` is
    set to an empty string here.
    """
    BaseSpider.__init__(self)
    self.login_url = 'http://acm.zju.edu.cn/onlinejudge/login.do'
    # Left empty in this initialiser.
    self.status_url = ''
def __init__(self, idList=None, threadId=0):
    """Initialise the spider by delegating to ``BaseSpider.__init__``.

    Args:
        idList: ids handed to the base class. When omitted (or None) a
            fresh empty list is created for this instance.
        threadId: forwarded unchanged to the base class; defaults to 0.
    """
    # A literal ``[]`` default is created once at definition time and shared
    # by every call that omits the argument; build a new list per call so
    # instances can never see each other's ids.
    if idList is None:
        idList = []
    BaseSpider.__init__(self, idList, threadId)
def __init__(self):
    """Initialise the spider; all setup is delegated to ``BaseSpider.__init__``."""
    BaseSpider.__init__(self)
def __init__(self):
    """Run the BaseSpider initialiser, then store the acm.hdu.edu.cn login URL."""
    BaseSpider.__init__(self)
    # acm.hdu.edu.cn login endpoint (userloginex.php with action=login).
    self.login_url = 'http://acm.hdu.edu.cn/userloginex.php?action=login'
# NOTE(review): the next statement is the tail of an __init__ whose header lies
# outside this chunk; it is kept verbatim. The enclosing class indentation is
# not recoverable from this view -- re-indent when merging.
BaseSpider.__init__(self, idList, threadId)


def genUrl(self, id):
    """Build the pdfm2.eastmoney.com request URL for one id and return it.

    The URL asks for ``TYPE=wk`` with ``extend=kdj`` and names its callback
    ``fsDataTeacma<current time in ms>``. The generated URL is also printed.

    Args:
        id: the id to query; converted with ``str()`` so numeric ids work.

    Returns:
        The fully substituted URL string.

    Raises:
        IndexError: if ``str(id)`` is empty.
    """
    # Convert once. The original converted only for the prefix test and then
    # passed the raw value to str.replace(), which raises TypeError for a
    # non-string id.
    idText = str(id)
    callback = 'fsDataTeacma' + str(int(round(time.time() * 1000)))

    # Ids starting with '3' or '0' get the digit '2' appended, all others '1'.
    # NOTE(review): presumably a market/exchange selector -- confirm.
    idSuffix = '2' if idText[0] in ('3', '0') else '1'

    template = (
        'http://pdfm2.eastmoney.com/EM_UBG_PDTI_Fast/api/js?id=$ID$'
        + idSuffix
        + '&TYPE='
        + 'wk&js=$FUN$((x))&rtntype=4&extend=kdj&check=kte&authorityType=fa&$FUN$=$FUN$'
    )
    url = template.replace('$ID$', idText).replace('$FUN$', callback)
    print(url)
    return url


# Sample request URLs kept for reference; this bare string has no runtime effect.
'''
http://pdfm.eastmoney.com/EM_UBG_PDTI_Fast/api/js?token=4f1862fc3b5e77c150a2b985b12db0fd&rtntype=6&id=0000022&type=wk&authorityType=fa&cb=jsonp1539234497088
http://pdfm2.eastmoney.com/EM_UBG_PDTI_Fast/api/js?id=0000022&TYPE=k&js=fsDataTeacma((x))&rtntype=4&extend=kdj&check=kte&authorityType=fa&fsDataTeacma=fsDataTeacma
'''

if __name__ == '__main__':
    # Number of worker threads; the id list is cut into this many slices.
    # With fewer ids than threads, the surplus threads receive empty slices.
    threads = 50
    idList = BaseSpider.getIdList()
    KdjWkSpider().initDir()

    # Spread the ids as evenly as possible. The original used
    # len(idList) / threads as a fixed slice width, which silently dropped the
    # last len(idList) % threads ids; here the first `extra` threads take one
    # additional id each so every id is assigned exactly once.
    base, extra = divmod(len(idList), threads)
    begin = 0
    for threadId in range(1, threads + 1):
        end = begin + base + (1 if threadId <= extra else 0)
        subIdList = idList[begin:end]
        spider = KdjWkSpider(subIdList, threadId)
        spider.start()
        begin = end
def __init__(self):
    """Run the BaseSpider initialiser, then store the uva.onlinejudge.org login URL."""
    BaseSpider.__init__(self)
    # uva.onlinejudge.org com_comprofiler login task.
    self.login_url = 'http://uva.onlinejudge.org/index.php?option=com_comprofiler&task=login'