Example #1
def setRdInfo(newsitem,rd):
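    # Populate rd (a RequestData) from one news item: clip URL/title, publish time,
    # image URLs and <p> text taken from the detail data returned by detailUrl(),
    # and dump non-empty text to ./newstext/<md5(title)>.txt before returning (rd, ok).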
    try:
        print("Setting up Requestdata ... NewsTitle is " + newsitem["title"])
        detail= detailUrl(newsitem)
        print detail
        rd.setClipUrl(detail['shareUrl'])
        rd.setClipTitle(newsitem["title"])
        pubtime = newsitem['releaseDate']
        rd.setPublishTime(pubtime)
        imagesrc = []
        rd.setCategory('image')
        imglist = detail['images']
        for img in imglist:
            imagesrc.append(img['imageUrl'])
        rd.setSourceUrl(imagesrc)
        content = ''
        text = BeautifulSoup(detail['html']).find_all('p')
        for t in text:
            if t.text:
                content+=t.text
        content = content.replace('\r','').replace('\n','').replace('\t','').replace('"','“')
        rd.setContent(content)
        if len(content)>0:
            f = file('./newstext/'+getMd5(rd._clip_title)+'.txt','w+')
            f.write(content)
            f.close()
        print '====*'+rd._clip_title+'*===='
        print json.dumps(rd.hostparseToJson())
        return rd,True
    except:
        appLogger.error(traceback.format_exc())
        print('crawl page failed')
        return rd,False
Example #2
def setRdInfo(newsitem,rd):
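    # Variant that fetches the detail page HTML itself via getList(detailurl) and
    # scrapes <p> text and <img> src attributes with BeautifulSoup.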
    try:
        print("Setting up Requestdata ... NewsTitle is " + newsitem["title"])
        detailurl = detailUrl(newsitem)
        print detailurl
        rd.setClipUrl(detailurl)
        rd.setClipTitle(newsitem["title"])
        pubtime = newsitem['date']
        rd.setPublishTime(pubtime)
        imagesrc = []
        rd.setCategory('image')
        soup = BeautifulSoup(getList(detailurl))
        text = soup.find_all("p")
        imglist = soup.find_all("img")
        for img in imglist:
            if img.has_attr('src') and img['src'] != '':
                imagesrc.append(img['src'].replace('\\"', ''))
        rd.setSourceUrl(imagesrc)
        content = ''
        for t in text:
            if t.string:
                content += t.string
        content = content.replace('\r', '').replace('\n', '').replace('\t', '').replace('"', '“')
        rd.setContent(content)
        print '====*'+rd._clip_title+'*===='
        print json.dumps(rd.hostparseToJson())
        return rd, True
    except KeyError:
        appLogger.warning(traceback.format_exc())
        return rd, False
    except:
        appLogger.error(traceback.format_exc())
        return rd, False
Example #3
def setRdInfo(newsurl, rd):
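    # Video variant: fetch the article at newsurl, take the title from <title>,
    # split the 'xinhua-info' block on the hard-coded year '2017' to get source and
    # publish time, and collect <video> src URLs.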
    try:
        detailurl = newsurl
        print detailurl
        time.sleep(5)
        detail = getList(detailurl)
        soup = BeautifulSoup(detail, 'lxml')
        rd.setClipUrl(detailurl)
        title = soup.find('title')
        appLogger.info("Setting up Requestdata ... NewsTitle is " + title.text)
        rd.setClipTitle(title.text)
        info = soup.find('div', attrs={
            'class': 'xinhua-info'
        }).text.split('2017')
        print info
        rd.setPublishTime('2017' + info[1])
        rd.setClipSource(info[0])
        videosrc = []
        rd.setCategory('video')
        vlist = soup.find_all('video')
        for i in vlist:
            videosrc.append(i["src"])
        rd.setSourceUrl(videosrc)
        print json.dumps(rd.hostparseToJson())
        return rd, True
    except:
        appLogger.error(traceback.format_exc())
        appLogger.info('crawl page failed')
        return rd, False
Example #4
def getNewsList(channel):
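    # Crawl one channel: build its news list with id2Url, fill a RequestData per item
    # via setRdInfo and push non-empty results to the MQ. The trailing break limits
    # this to the first page even though the loop is bounded by TenDaysAgoTime.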
    lasttime = CurrentTime
    print("Crawling Channels ... channelid is " +channel['name'])
    pagenum =0
    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    while (cmp(lasttime,TenDaysAgoTime) == 1):
        time.sleep(1)
        print channel['name'], pagenum
        try :
            newsList = id2Url(channel,pagenum)
            if newsList==None or len(newsList)<1:
                break
        except:
            appLogger.error(traceback.format_exc())
            print("fail to crawl page")
            break

        for newsitem in newsList:
            time.sleep(0.5)
            #print(newsitem["title"])
            rd,isSuccess= setRdInfo(newsitem,rd)
            if not isSuccess:
                continue

            print(rd._clip_title + " is successfully crawled, sending to MQ...")
            if len(rd._source_url)!=0 or len(rd._content)!=0:
                sendToMQ(rd)

            lasttime = rd._publish_time
        break
Example #5
def main():
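    # Entry point: load system config (DB, MQ, logger, time window), fetch the wechat
    # channel list from the tracksource service (falling back to the local config file),
    # then crawl each whitelisted channel with a 300 s pause between channels.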
    global wechatConf, wechatCrawlerQueue, Channels, CurrentTime, TenDaysAgoTime
    if len(sys.argv) != 3:
        usage()

    config_dir = sys.argv[2]
    configFile = os.path.join(config_dir, MODULENAME + ".conf")

    #1.load system config
    appConf = appSystemVars.appSystemConf
    appConf.loadConfigBuffer(MAINCONFIGBUFFER)
    crawlerDB = appConf.getCrawlerDB()
    resultManager = appConf.getResultManager()
    DBPC = appConf.getDBPC()

    logConfigger = appConf.getLogger()
    ampqer = appConf.getMQ()
    timeperiod = appConf.getTimePeriod()

    tracksource = appConf.getTrackSource()
    tracksourceHost = tracksource.getHost()
    tracksourcePort = tracksource.getPort()
    #2. load wechat config
    try:
        Channels = getWechatChannels(tracksourceHost, tracksourcePort)
    except:
        appLogger.error(traceback.format_exc())
        wechatConf = loadConfig(configFile)
        Channels = wechatConf['channels']
    wechatCrawlerQueue = appCrawlerQueue(ampqer.getURL(), ampqer.getExchange(),
                                         ampqer.getRoutingKey(),
                                         ampqer.getHostQueue())
    CurrentTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    TenDaysAgo = (datetime.datetime.now() -
                  datetime.timedelta(int(timeperiod)))
    TenDaysAgoTime = TenDaysAgo.strftime("%Y-%m-%d %H:%M:%S")
    #start crawler
    appLogger.info("Start wechatCrawler ...")
    print len(Channels)
    start = time.time()

    channel_list = '太行日报 新华视点 淮北日报 新华视界 新华国际 微黔江 天津日报 人民日报 上海观察 深圳特区报'
    for channel in Channels:
        if channel['name'] in channel_list:
            #print channel['name']
            getNewsList(channel)
            time.sleep(300)
    end = time.time()
    #crawl timeline
    print end - start
Example #6
def wechatNewsList(channel):
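    # Render the Sogou Weixin search page for this channel with a PhantomJS helper
    # script and read the scraped image/video news items back from wechat.out.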
    url = 'http://weixin.sogou.com/weixin?type=1&query=' + channel[
        "sign"] + '&ie=utf8&_sug_=y&_sug_type_='
    phantomjs_cmd = 'python bin/load_phantomjs.py wechat load_page "%s"' % (
        url, )
    p = subprocess.Popen(phantomjs_cmd, shell=True)
    p.communicate()
    success, img_news, video_news = False, None, None
    res = json.load(open('wechat.out', 'r+'))
    #print res
    if not res['status']:
        success = True
        img_news = res['data']['img_news']
        video_news = res['data']['video_news']
    else:
        appLogger.error(res['data']['error'])
    return success, img_news, video_news
Example #7
def weiboNewsList(channel):
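    # Same PhantomJS approach for the channel's Weibo profile page; scraped items
    # are read back from weibo.out.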
    url = 'http://www.weibo.com/' + channel[
        'sign'] + '?profile_ftype=1&is_all=1#_0'
    success, news = False, None
    phantomjs_cmd = 'python bin/load_phantomjs.py weibo load_page "%s"' % (
        url, )
    p = subprocess.Popen(phantomjs_cmd, shell=True)
    p.communicate()
    #time.sleep(300)
    res = json.load(open('weibo.out', 'r+'))
    #print res
    if not res['status']:
        success = True
        news = res['data']['news']
    else:
        appLogger.error(res['data']['error'])
    return success, news
Example #8
def getNewsList(channel):
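    # Crawl one Weibo channel: fetch its news list, stamp each item with the channel's
    # trackSourceId and send items that have sources or content to the MQ.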
    appLogger.info("Crawling Channels ... channelid is " + channel['name'])

    time.sleep(10)
    print channel['sign'], channel['name'].encode('utf-8')
    try:
        isSuccess, newsList = weiboNewsList(channel)
        if not isSuccess or (newsList is None or len(newsList) < 1):
            return
    except:
        appLogger.error(traceback.format_exc())
        appLogger.warn("fail to crawl " + channel['name'] + " main page")
        return
    for rd in newsList:
        rd['trackSourceId'] = int(channel['id'])
        appLogger.info(rd['clipTitle'] +
                       " is successfully crawled, sending to MQ...")
        if len(rd['sourceUrl']) != 0 or len(rd['content']) != 0:
            sendToMQ(rd)
Example #9
def getChannelNewsList(channelid):
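    # Page through a channel's list API (id2Url) until items fall outside the time
    # window; items with mp4_url/m3u8_url go through setVinfo, others through setRdInfo,
    # and each result is base64-encoded JSON handed to setTask.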
    print("Crawling Channels ... channelid is " + channelid)

    pagenum = 1
    lasttime = CurrentTime

    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    #print channelid
    while cmp(lasttime, TenDaysAgoTime) == 1:
        #print pagenum
        listurl = id2Url(ChannelIds[channelid], pagenum)
        print listurl
        if ChannelIds[channelid] == 'T1457068979049':
            newsLists = getList(listurl)[u'\u89c6\u9891']
        else:
            newsLists = getList(listurl)[ChannelIds[channelid]]
        if len(newsLists) == 0:
            break
        for newsitem in newsLists:
            #print json.dumps(newsitem)
            try:
                if newsitem.has_key('mp4_url') or newsitem.has_key('m3u8_url'):
                    rd = setVinfo(newsitem, rd)
                else:
                    rd = setRdInfo(newsitem, rd)
                if not rd:
                    continue
                lasttime = rd._publish_time

                #print rd._publish_time +":::::"+ rd._clip_title
                print("Newsitem is successfully crawled , sending to MQ...")
                #print rd.hostparseToStr()
                rd_json = rd.hostparseToJson()
                rd_base64 = base64.encodestring(json.dumps(rd_json))
                setTask(rd_base64)
            except:
                appLogger.error(traceback.format_exc())
                print("Newsitem crawling failed")
                continue
        pagenum += 1
Example #10
def getNewsList(channel):
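    # Page through a channel whose list API returns nested dataList/list groups,
    # flatten them into one newsList, then build and send a RequestData per item.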
    print channel
    pagenum = 1
    lasttime = CurrentTime

    print("Crawling Channels ... channelid is " +channel['name'])
    
    rd = RequestData()

    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    while (cmp(lasttime,TenDaysAgoTime) == 1):
        time.sleep(1)
        print channel['id'],pagenum
        clipurl = id2Url(channel,pagenum)
        try :
            newsLists = json.loads(getList(clipurl))
            newsList = []
            for i in newsLists['dataList']:
                for j in i['list']:
                    newsList.append(j)
            if newsList==None or len(newsList)<1:
                break
        except:
            appLogger.error(traceback.format_exc())
            break

        for newsitem in newsList:
            rd,isSuccess= setRdInfo(newsitem,rd)        
            if not isSuccess:
                continue

            print(rd._clip_title + " is successfully crawled, sending to MQ...")
            if len(rd._source_url)!=0 or len(rd._content)!=0:
                sendToMQ(rd)

            lasttime = rd._publish_time
        pagenum += 1
Example #11
def setRdInfo(newsitem, rd):
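    # Build an image RequestData (rd) and a video copy (rdv) from one item: images come
    # from childs_data, text from the detail JSON's content HTML, and content_video_url
    # (if present) becomes the video source. Returns (rd, rdv, ok).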
    try:
        #print json.dumps(newsitem)
        print("Setting up Requestdata ... NewsTitle is " + newsitem["title"])
        clip_url = detail_url(newsitem['id'])
        detail = json.loads(getList(clip_url))
        rd.setClipTitle(newsitem["title"])
        pubtime = newsitem['publish_time']
        rd.setPublishTime(pubtime)
        imagesrc = []
        rd.setCategory('image')
        if newsitem.has_key("childs_data"):
            imglist = newsitem["childs_data"]
            for img in imglist:
                imagesrc.append(img['host'] + img['dir'] + img['filepath'] +
                                img['filename'])
        rd.setSourceUrl(imagesrc)
        rd.setClipUrl(newsitem['content_url'])
        soup = BeautifulSoup(detail['content'])
        text = soup.find_all("p")
        content = ''
        for t in text:
            if t.string:
                content += t.string
        content = content.replace('\r', '').replace('\n', '').replace(
            '\t', '').replace('"', '“')
        rd.setContent(content)
        rdv = rd.copyrd()
        rdv.setCategory('video')
        vsrc = []
        if detail.has_key('content_video_url'):
            vsrc.append(detail['content_video_url'])
        rdv.setSourceUrl(vsrc)
        rd.hostparseToJson()
        rdv.hostparseToJson()
        return rd, rdv, True
    except:
        appLogger.error(traceback.format_exc())
        print('crawl page failed')
        return rd, rd, False
Example #12
def setRdInfo(newsitem, rd):
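    # Like the plain image variant, but also records the clip source, converts the
    # epoch timestamp 'ts' (divided by 1000) via DateFormat, and collects both src
    # and data-src image URLs.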
    try:
        print("Setting up Requestdata ... NewsTitle is " + newsitem["title"])
        detailurl = detailUrl(newsitem)
        print detailurl
        rd.setClipUrl(detailurl)
        rd.setClipTitle(newsitem["title"])
        pubtime = DateFormat(int(newsitem['ts']) / 1000)
        rd.setPublishTime(pubtime)
        rd.setClipSource(newsitem['site'])
        imagesrc = []
        rd.setCategory('image')
        soup = BeautifulSoup(getList(detailurl))
        text = soup.find_all("p")
        imglist = soup.find_all("img")
        for img in imglist:
            if img.has_attr('src') and img['src']:
                imagesrc.append(img['src'].replace('\\"', ''))
            if img.has_attr('data-src') and img['data-src']:
                imagesrc.append(img['data-src'].replace('\\"', ''))
        rd.setSourceUrl(imagesrc)
        content = ''
        for t in text:
            if t.string:
                content += t.string
        content = content.replace('\r', '').replace('\n', '').replace(
            '\t', '').replace('"', '“')
        rd.setContent(content)
        if content:
            f = file('./newstext/' + getMd5(rd._clip_title) + '.txt', 'w+')
            f.write(content)
            f.close()
        print '====*' + rd._clip_title + '*===='
        print json.dumps(rd.hostparseToJson())
        return rd, True
    except:
        appLogger.error(traceback.format_exc())
        print('crawl page failed')
        return rd, False
Example #13
def setRdInfo(newsitem,rd):
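    # Scrape an article page directly (newsitem is the URL): title and publish time come
    # from the 'dbt' section, text and relative image URLs (prefixed with
    # http://news.2500sz.com/) from the 'wen' div; non-empty text is saved to ./newstext/.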
    try:
        detailurl = newsitem
        print detailurl
        soup = BeautifulSoup(getList(detailurl))
        info = soup.find('section',attrs={'class':'dbt'})
        contentsoup = soup.find('div',attrs={'class':'wen'})
        title = info.h1.text
        pubtime = info.h2.span.i.text
        print("Setting up Requestdata ... NewsTitle is " + title)
        rd.setClipUrl(detailurl)
        rd.setClipTitle(title)
        rd.setPublishTime(pubtime)
        imagesrc = []
        rd.setCategory('image')
        text = contentsoup.find_all("p")
        imglist = contentsoup.find_all("img")
        for img in imglist:
            if img.has_attr('src') and img['src']:
                imagesrc.append('http://news.2500sz.com/'+img['src'].replace('\\"',''))
        rd.setSourceUrl(imagesrc)
        content = ''
        for t in text:
            if t.text:
                content+=t.text
        content = content.replace('\r','').replace('\n','').replace('\t','').replace('"','“')
        rd.setContent(content)
        if len(content)>0:
            f = file('./newstext/'+getMd5(rd._clip_title)+'.txt','w+')
            f.write(content)
            f.close()
        print json.dumps(rd.hostparseToJson())
        return rd,True
    except:
        appLogger.error(traceback.format_exc())
        print('crawl page failed')
        return rd,False
Example #14
     
    logConfigger = appConf.getLogger()
    ampqer = appConf.getMQ()
    timeperiod = appConf.getTimePeriod()


    #2. load wenhui config
    wenhuiConf = loadConfig(configFile)
    Channels = getChannels()
    #wenhuiCrawlerQueue = appCrawlerQueue (wenhuiConf["amqpurl"],wenhuiConf["request_queue"], wenhuiConf["request_queue"], wenhuiConf["request_queue"])
    wenhuiCrawlerQueue = appCrawlerQueue(
            ampqer.getURL(), ampqer.getExchange(), ampqer.getRoutingKey(), ampqer.getHostQueue()
            )
   
    CurrentTime = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime())
    TenDaysAgo = (datetime.datetime.now() - datetime.timedelta(int(timeperiod)))
    TenDaysAgoTime = TenDaysAgo.strftime("%Y-%m-%d %H:%M:%S")
    #start crawler
    #print("Start wenhuiCrawler ...")
    print Channels
    for channel in Channels:
        getNewsList(channel)
    #crawl timeline

if __name__ == '__main__':
    try:
        main()
    except:
        appLogger.error(traceback.format_exc())