def crawDailyThemeNews(link):
    """Collect today's news entries from the list page at ``link``.

    The ``<ul class="mainlist" id="mainlist">`` section of the page is walked
    one ``<li>`` item at a time. Walking stops at the first item whose
    publication date (first 10 characters) differs from today's local date,
    and items with an empty title are skipped.

    Args:
        link: Location of the list page; passed unchanged to
            ``StcnThemeNewsSpiderUtils.returnStartContext``.

    Returns:
        A list of ``[keyid, linkUrl, pubDate, title, 'STCNNET']`` rows, where
        ``keyid`` is a freshly generated ``uuid1`` string.
    """
    list_open_tag = '<ul class="mainlist" id="mainlist">'
    currentList = []
    startContext = StcnThemeNewsSpiderUtils.returnStartContext(link, list_open_tag)
    startContext = StcnThemeNewsSpiderUtils.filterContextByTarget(startContext, list_open_tag, '</ul>')
    # Not called `len`, so the builtin stays usable inside this function.
    item_count = StcnThemeNewsSpiderUtils.findAllTarget(startContext, '<li>')
    # Computed once: the value is loop-invariant, and a single reference date
    # keeps the comparison consistent even if the crawl runs across midnight.
    today = time.strftime("%Y-%m-%d", time.localtime())
    for _ in range(item_count):
        targetContext = StcnThemeNewsSpiderUtils.divisionTarget(startContext, '<li>', '</li>')
        # Continue from the remainder on the next iteration.
        startContext = targetContext['nextContext']
        currentcontext = targetContext['targetContext']
        # The '.shtml' end marker is re-appended after extraction.
        linkUrl = StcnThemeNewsSpiderUtils.filterContextByTarget(currentcontext, 'href="', '.shtml') + '.shtml'
        title = StcnThemeNewsSpiderUtils.filterContextByTarget(currentcontext, '.shtml">', '</a>')
        # NOTE(review): presumably the date sits in the second <span> of the
        # item (text after the first '</span>') -- confirm against the markup.
        pubDate = StcnThemeNewsSpiderUtils.filterAfterContext(currentcontext, '</span>')
        pubDate = StcnThemeNewsSpiderUtils.filterContextByTarget(pubDate, '<span>', '</span>')
        # NOTE(review): stopping at the first non-today item assumes the page
        # lists newest entries first -- verify.
        if pubDate[:10] != today:
            break
        if title != '':
            # The id is only generated for rows that are actually kept.
            keyid = str(uuid.uuid1())
            currentList.append([keyid, linkUrl, pubDate, title, 'STCNNET'])

    return currentList
Example #2
0
def crawDailyThemeNews(link):
    """Return the news rows published today on the list page at ``link``.

    Items are read in page order from the
    ``<ul class="mainlist" id="mainlist">`` block. The scan ends as soon as an
    item's publication date (first 10 characters) is not today's local date
    in ``YYYY-MM-DD`` form; items whose title is empty are not recorded.

    Args:
        link: Location of the list page, forwarded as-is to
            ``StcnThemeNewsSpiderUtils.returnStartContext``.

    Returns:
        A list of ``[keyid, linkUrl, pubDate, title, 'STCNNET']`` rows;
        ``keyid`` is a new ``uuid1`` string per row.
    """
    list_open_tag = '<ul class="mainlist" id="mainlist">'
    currentList = []
    startContext = StcnThemeNewsSpiderUtils.returnStartContext(link, list_open_tag)
    startContext = StcnThemeNewsSpiderUtils.filterContextByTarget(startContext, list_open_tag, '</ul>')
    # Avoid the name `len`, which would shadow the builtin.
    item_count = StcnThemeNewsSpiderUtils.findAllTarget(startContext, '<li>')
    # Hoisted out of the loop: one reference date for the whole crawl, so
    # every item is compared against the same day.
    today = time.strftime("%Y-%m-%d", time.localtime())
    for _ in range(item_count):
        targetContext = StcnThemeNewsSpiderUtils.divisionTarget(startContext, '<li>', '</li>')
        # The unconsumed remainder becomes the input of the next iteration.
        startContext = targetContext['nextContext']
        currentcontext = targetContext['targetContext']
        # Extraction ends at '.shtml', so the suffix is added back.
        linkUrl = StcnThemeNewsSpiderUtils.filterContextByTarget(currentcontext, 'href="', '.shtml') + '.shtml'
        title = StcnThemeNewsSpiderUtils.filterContextByTarget(currentcontext, '.shtml">', '</a>')
        # NOTE(review): looks like the date is taken from the <span> that
        # follows the first '</span>' in the item -- confirm with real markup.
        pubDate = StcnThemeNewsSpiderUtils.filterAfterContext(currentcontext, '</span>')
        pubDate = StcnThemeNewsSpiderUtils.filterContextByTarget(pubDate, '<span>', '</span>')
        # NOTE(review): breaking here presumes newest-first ordering on the
        # page -- verify.
        if pubDate[:10] != today:
            break
        if title != '':
            # Generate the id only when the row is kept.
            keyid = str(uuid.uuid1())
            currentList.append([keyid, linkUrl, pubDate, title, 'STCNNET'])

    return currentList