Ejemplo n.º 1
0
def crawSXMetalComments(link):
    """Collect the single comment entry located via the 'tab_conbox2' marker.

    The content for *link* is obtained through
    JTMetalNetSpiderUtils.returnStartContext using the
    '<ul class="tab_conbox" id="tab_conbox2">' marker, and the
    '<div>' ... '</div>' target inside it is then used as the entry from
    which URL, title and date are extracted.
    NOTE(review): the helpers are defined outside this block; presumably
    returnStartContext downloads the page at *link* -- confirm there.

    Args:
        link: Site address; it is also used as the prefix of the entry URL.

    Returns:
        A list holding exactly one row of the form
        [uuid1 string, entry URL, title, publication date, description,
        'METAL', 'GTNET'].
    """
    utils = JTMetalNetSpiderUtils
    pageContext = utils.returnStartContext(link, '<ul class="tab_conbox" id="tab_conbox2">')
    entryContext = utils.filterContextByTarget(pageContext, '<div>', '</div>')

    # The extracted href is appended to *link* unchanged.
    entryUrl = link + utils.filterContextByTarget(entryContext, '<a href="', '" title')
    entryTitle = utils.filterContextByTarget(entryContext, '<font style="color:red;" >', '</font>')
    entryDate = utils.filterContextByTarget(entryContext, '<span>', '</span>')

    # The description comes from a separate lookup on the entry's own URL.
    entryDescription = crawDescriptContext(entryUrl)

    return [[str(uuid.uuid1()), entryUrl, entryTitle, entryDate, entryDescription, 'METAL', 'GTNET']]
Ejemplo n.º 2
0
def writeDailyMetalComments():
    """Crawl the jiatai9999.com comment pages and delete the stored 'GTNET' rows.

    Steps performed:
      1. Gather rows from crawSXMetalComments (site root) and
         crawDailyMetalComments (list page, catId=5) into one list.
      2. Open a connection through JTMetalNetSpiderUtils.getMySQLConn() and
         delete every row of COMMENTS_METAL_RESOURCE_TABLE whose SOURCEFLAG
         is 'GTNET', committing on success.

    A database error is printed and the transaction is rolled back; it is
    not re-raised.  Returns None.
    """
    webLink = 'http://www.jiatai9999.com/'
    linkUrl = 'http://www.jiatai9999.com/list.php?catId=5'

    # NOTE(review): resultList is built but never written to the database in
    # this function -- the INSERT step appears to be missing; confirm intent.
    resultList = crawSXMetalComments(webLink) + crawDailyMetalComments(linkUrl, webLink)

    # NOTE(review): the connection itself is left open because its ownership
    # (shared vs. per-call) is decided inside getMySQLConn -- verify there.
    conn = JTMetalNetSpiderUtils.getMySQLConn()
    cursor = conn.cursor()
    try:
        cursor.execute("DELETE  FROM  COMMENTS_METAL_RESOURCE_TABLE  WHERE  SOURCEFLAG = 'GTNET'")
        conn.commit()
    except conn.Error as e:
        # "except ... as" and the parenthesised print are valid on both
        # Python 2.6+ and Python 3, unlike the former "except X, e" / print
        # statement forms.
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
        conn.rollback()
    finally:
        # The cursor is created locally, so release it on every path.
        cursor.close()
Ejemplo n.º 3
0
def crawDailyMetalComments(link, webLink, maxItems=7):
    """Collect comment rows from the '<ul id="lie">' list of a listing page.

    The list content is obtained with the JTMetalNetSpiderUtils helpers and
    then consumed one '<li>' ... '</li>' item at a time via divisionTarget,
    which returns a dict whose 'targetContext' entry is used as the current
    item and whose 'nextContext' entry is used as the input for the next
    round.
    NOTE(review): what divisionTarget returns once the list has fewer than
    *maxItems* items is decided in JTMetalNetSpiderUtils -- confirm before
    raising the limit.

    Args:
        link: Address of the listing page.
        webLink: Prefix prepended to each extracted href.
        maxItems: Number of list items to read (default 7, the value that
            was previously hard-coded).

    Returns:
        A list of rows, each
        [uuid1 string, item URL, title, publication date, description,
        'METAL', 'GTNET'].
    """
    currentList = []
    startContext = JTMetalNetSpiderUtils.returnStartContext(link, '<ul id="lie">')
    startContext = JTMetalNetSpiderUtils.filterContextByTarget(startContext, '<ul id="lie">', '</ul>')

    for _ in range(maxItems):
        targetContext = JTMetalNetSpiderUtils.divisionTarget(startContext, '<li>', '</li>')
        # Continue from the remainder on the next iteration.
        startContext = targetContext['nextContext']
        currentContext = targetContext['targetContext']

        linkUrl = webLink + JTMetalNetSpiderUtils.filterContextByTarget(currentContext, '<a href="', '" title')
        title = JTMetalNetSpiderUtils.filterContextByTarget(currentContext, 'title="', '">')
        pubDate = JTMetalNetSpiderUtils.filterContextByTarget(currentContext, '<span>', '</span>')
        # The description comes from a separate lookup on the item's own URL.
        descriptContext = crawDescriptContext(linkUrl)

        currentList.append([str(uuid.uuid1()), linkUrl, title, pubDate, descriptContext, 'METAL', 'GTNET'])

    return currentList
Ejemplo n.º 4
0
def crawDescriptContext(link):
    """Extract the description text for the page at *link*.

    The JTMetalNetSpiderUtils helpers are applied in sequence:
    returnStartContext and filterAfterContext with the '</strong></p>'
    marker, filterContextByTarget with '<p>' / '</p>', and finally
    removeSpecialCharacter on the result.
    NOTE(review): helper semantics live outside this block; presumably this
    yields the first paragraph after the marker -- confirm in the utils.

    Args:
        link: Address of the page to read the description from.

    Returns:
        Whatever removeSpecialCharacter returns for the extracted target.
    """
    utils = JTMetalNetSpiderUtils
    marker = '</strong></p>'

    pageContext = utils.returnStartContext(link, marker)
    afterMarker = utils.filterAfterContext(pageContext, marker)
    paragraph = utils.filterContextByTarget(afterMarker, '<p>', '</p>')
    return utils.removeSpecialCharacter(paragraph)