def writeDailyStockComments(): link = 'http://stock.huagu.com/hgsd/' webNet = 'http://stock.huagu.com' currentList = crawDailyStockComments(link,webNet) conn = HGStockNetSpiderUtils.getMySQLConn() cursor = conn.cursor() try: cursor.execute("DELETE FROM COMMENTS_STOCK_RESOURCE_TABLE WHERE SOURCEFLAG = 'HGNET'") conn.commit() except conn.Error,e: print "Mysql Error %d: %s" % (e.args[0], e.args[1]) conn.rollback()
def crawDailyStockComments(link,webNet): currentList = [] startContext = HGStockNetSpiderUtils.returnStartContext(link,'<ul class="ul-news-list">') startContext = HGStockNetSpiderUtils.filterContextByTarget(startContext,'<ul','</ul>') len = HGStockNetSpiderUtils.findAllTarget(startContext,'<li>') for i in range(len): targetContext = HGStockNetSpiderUtils.divisionTarget(startContext,'<li>','</li>') startContext = targetContext['nextContext'] currentContext = targetContext['targetContext'] linkUrl = webNet + HGStockNetSpiderUtils.filterContextByTarget(currentContext,'<a href="','">') title = HGStockNetSpiderUtils.filterContextByTarget(currentContext,'">','</a>') currentYear = str(time.strftime('%Y',time.localtime(time.time())))+'-' pubDate = currentYear + HGStockNetSpiderUtils.filterContextByTarget(currentContext,'[',']') descriptContext = crawDailyDescriptContext(linkUrl) currentList.append([str(uuid.uuid1()),linkUrl,title,pubDate,descriptContext,'STOCK','HGNET']) return currentList
def crawDailyDescriptContext(linkUrl): startContext = HGStockNetSpiderUtils.returnStartContext(linkUrl,'<div class="article_con" id="div-article-content">') filterContext = HGStockNetSpiderUtils.filterContextByTarget(startContext,'<p>','</p>') filterContext = HGStockNetSpiderUtils.removeSpecialCharacter(filterContext) return filterContext