Code example #1
File: xinhuaCrawler.py Project: aldslvda/crawler
def getNewsList(date):
    lasttime = CurrentTime

    appLogger.info("Crawling Xinhuashe Video")

    rd = RequestData()
    rd.setSouceType('app')
    rd.setMetaID('')
    rd.setTrackSourceID(TRACESOURCEID)
    try:
        url = 'http://pub.zhongguowangshi.com/getRecord?date=' + date
        print url
        newsList = getXinhuaList(url)

    except:
        appLogger.info("fail to crawl page")
        return
    for newsurl in newsList:
        rd, isSuccess = setRdInfo(newsurl, rd)
        if not isSuccess:
            continue

        appLogger.info(rd._clip_title +
                       " is successfully crawled, sending to MQ...")
        if len(rd._source_url) != 0 or len(rd._content) != 0:
            sendToMQ(rd)
        lasttime = rd._publish_time
Code example #2
def getNewsList(channel):
    lasttime = CurrentTime
    print("Crawling Channels ... channelid is " +channel['name'])
    pagenum =0
    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    while (cmp(lasttime,TenDaysAgoTime) == 1):
        time.sleep(1)
        print channel['name'], pagenum
        try:
            newsList = id2Url(channel, pagenum)
            if newsList is None or len(newsList) < 1:
                break
        except:
            appLogger.error(traceback.format_exc())
            print("fail to crawl page")
            break

        for newsitem in newsList:
            time.sleep(0.5)
            #print(newsitem["title"])
            rd, isSuccess = setRdInfo(newsitem, rd)
            if not isSuccess:
                continue

            print(rd._clip_title + "is successfully crawled , sending to MQ...")
            if len(rd._source_url)!=0 or len(rd._content)!=0:
                sendToMQ(rd)

            lasttime = rd._publish_time
        break
Code example #3
def getChannelNewsList(channelid):
    print("Crawling Channels ... channelid is " + channelid)

    lasttime = CurrentTime

    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')

    Listurl = id2Url(ChannelIds[channelid])
    newsLists = json.loads(getList(Listurl))

    domaindb = sqlite3.connect("news.db")
    cursor = domaindb.cursor()
    cursor.execute(
        "create table if not exists tencentnews (id integer primary key,pid text)"
    )
    #print channelid
    if len(newsLists["idlist"][0]["ids"]) == 0:
        return
    for newsitem in newsLists["idlist"][0]["ids"]:
        if cmp(lasttime, TenDaysAgoTime) == -1:
            break
        if not int(newsitem["exist"]) == 1:
            continue
        cursor.execute("select * from tencentnews where pid='" +
                       str(newsitem["id"]) + "'")
        if len(cursor.fetchall()) > 0:
            #print("Newsitem has been crawled before, pass...")
            continue
        try:
            rdi, rdv, isSuccess = setRdInfo(channelid, newsitem, rd)
        except:
            isSuccess = False
        if not isSuccess:
            continue
        print("Newsitem is successfully crawled , sending to MQ...")
        if len(rdi._source_url) != 0:
            #print rdi.hostparseToStr()
            sendToMQ(rdi)
        if rdv._outer_clipurl != "":
            #print rdv.linkparseToStr()
            sendTolinkMQ(rdv)
        domaindb.execute("insert into tencentnews(pid) values('" +
                         str(newsitem["id"]) + "')")
        domaindb.commit()
        lasttime = rd._publish_time
        #print rd._publish_time +"::::::"+ rd._clip_title
        time.sleep(0.1)
    domaindb.close()
Code example #4
def getNewsList(channelid):
    pagenum = 1
    lasttime = CurrentTime

    print("Crawling Channels ... channelid is " + channelid)

    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    #get requestdata
    while (cmp(lasttime, TenDaysAgoTime) == 1):

        #print pagenum
        listurl = id2Url(ChannelIds[channelid], pagenum)
        print(listurl)
        time.sleep(1)

        newsLists = getList(listurl)

        if newsLists is None:
            continue
        newsLists = json.loads(newsLists)

        if not newsLists.has_key("articles") or len(
                newsLists["articles"]) == 0:
            break
        for newsitem in newsLists["articles"]:
            if not newsitem.has_key("link") or not newsitem.has_key(
                    "time"
            ) or newsitem["link"] == "" or newsitem["link"][0:4] == "chan":
                continue
            try:
                rd, isSuccess = setRdInfo(newsitem, rd)
            except:
                isSuccess = False
            if not isSuccess:
                continue
            if len(rd._source_url) == 0:
                continue

            print(rd._clip_title +
                  " is successfully crawled, sending to MQ...")
            sendToMQ(rd)
            print("successfully sent to MQ !")

            lasttime = rd._publish_time

        pagenum += 1
Code example #5
def getChannelNewsList(channelid):
    print("Crawling Channels ... channelid is " + channelid)

    pagenum = 1
    lasttime = CurrentTime

    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    #print channelid
    while cmp(lasttime, TenDaysAgoTime) == 1:
        #print pagenum
        listurl = id2Url(ChannelIds[channelid], pagenum)
        print listurl
        if ChannelIds[channelid] == 'T1457068979049':
            newsLists = getList(listurl)[u'\u89c6\u9891']
        else:
            newsLists = getList(listurl)[ChannelIds[channelid]]
        if len(newsLists) == 0:
            break
        for newsitem in newsLists:
            #print json.dumps(newsitem)
            try:
                if newsitem.has_key('mp4_url') or newsitem.has_key('m3u8_url'):
                    rd = setVinfo(newsitem, rd)
                else:
                    rd = setRdInfo(newsitem, rd)
                lasttime = rd._publish_time

                if not rd:
                    continue
                #print rd._publish_time +":::::"+ rd._clip_title
                print("Newsitem is successfully crawled , sending to MQ...")
                #print rd.hostparseToStr()
                rd_json = rd.hostparseToJson()
                rd_base64 = base64.encodestring(json.dumps(rd_json))
                setTask(rd_base64)
            except:
                appLogger.error(traceback.format_exc())
                print("Newsitem crawling failed")
                continue
        pagenum += 1
Code example #6
File: kanszCrawler.py Project: aldslvda/crawler
def getNewsList(channel):
    #print channel
    pagenum = 0
    lasttime = CurrentTime

    print("Crawling Channels ... channelid is %s" % channel['name'])

    rd = RequestData()

    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    while (cmp(lasttime, TenDaysAgoTime) == 1):
        time.sleep(1)
        print channel['name'], pagenum
        clipurl = id2Url(channel, pagenum)
        print clipurl
        try:
            newsLists = json.loads(getList(clipurl))
            newsList = newsLists['list']
            if not newsList:
                break
        except:
            print("fail to crawl page")
            pagenum += 1
            continue

        for newsitem in newsList:
            if not newsitem.has_key("id") or newsitem['id'] == '':
                continue

            print(newsitem["title"])

            rd, rdv, isSuccess = setRdInfo(newsitem, rd)
            if not isSuccess:
                continue

            print(rd._clip_title +
                  " is successfully crawled, sending to MQ...")
            if rd._source_url or rd._content:
                sendToMQ(rd)
            if rdv._source_url or rdv._content:
                sendToMQ(rdv)

            lasttime = rd._publish_time
        pagenum += 1
Code example #7
File: wenhuiCrawler.py Project: aldslvda/crawler
def getNewsList(channel):
    print channel
    pagenum = 1
    lasttime = CurrentTime

    print("Crawling Channels ... channelid is " +channel['name'])
    
    rd = RequestData()

    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    while (cmp(lasttime, TenDaysAgoTime) == 1):
        time.sleep(1)
        print channel['id'], pagenum
        clipurl = id2Url(channel, pagenum)
        try:
            newsLists = json.loads(getList(clipurl))
            newsList = []
            for i in newsLists['dataList']:
                for j in i['list']:
                    newsList.append(j)
            if newsList is None or len(newsList) < 1:
                break
        except:
            appLogger.error(traceback.format_exc())
            break

        for newsitem in newsList:
            rd, isSuccess = setRdInfo(newsitem, rd)
            if not isSuccess:
                continue

            print(rd._clip_title + "is successfully crawled , sending to MQ...")
            if len(rd._source_url)!=0 or len(rd._content)!=0:

                sendToMQ(rd)

            lasttime = rd._publish_time
        pagenum += 1
Code example #8
File: wenhuiCrawler.py Project: aldslvda/crawler
def crawler(config):
    '''
    Return a RequestData object.
    '''
    rd = RequestData()
    return rd
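The crawler(config) entry point above is only a stub that returns an empty RequestData. For orientation, here is a minimal sketch (an illustration, not the actual wenhuiCrawler implementation) of the initialization pattern that the getNewsList functions in this listing apply to the same object before sending it to MQ:

def crawler_sketch(config):
    '''
    Sketch only: mirrors the setup used by the getNewsList examples above;
    the real stub sets none of these fields.
    '''
    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)  # module-level constant in each crawler
    rd.setSouceType('app')              # the project's own spelling of this setter
    rd.setMetaID('')
    return rd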
Code example #9
def getVideoList():
    pagenum = 1
    lasttime = CurrentTime

    print("Crawling Channels ... channelid is VideoChannel")

    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')

    print("Crawling VideoChannel ....")

    domaindb = sqlite3.connect("news.db")
    cursor = domaindb.cursor()
    cursor.execute(
        "create table if not exists sohunews (id integer primary key,pid text)"
    )

    #get requestdata
    while (cmp(lasttime, TenDaysAgoTime) == 1):

        #print pagenum
        listurl = id2Url(36, pagenum)

        time.sleep(1)
        newsLists = json.loads(getList(listurl))

        if not newsLists.has_key("articles") or len(
                newsLists["articles"]) == 0:
            break
        for newsitem in newsLists["articles"]:
            if not newsitem.has_key("link") or not newsitem.has_key(
                    "time") or newsitem["link"] == "":
                continue
            cursor.execute("select * from sohunews where pid='" +
                           newsitem["link"] + "'")
            if len(cursor.fetchall()) > 0:
                #print("Newsitem has been crawled before, pass...")
                continue
            mid = getVideoMid(newsitem["link"])
            clipurl = Videodetail(mid)
            #print clipurl
            newsdetail = getList(clipurl)
            if newsdetail is None:
                continue
            rd.setClipUrl(clipurl)
            rd.setClipTitle(newsitem["title"])
            rd.setCategory('video')
            rd.setPublishTime(DateFormat(int(newsitem["time"]) / 1000))
            rd.setViewCount(int(newsitem["commentNum"]))
            if newsitem.has_key("media"):
                rd.setClipSource(newsitem["media"])
            newsdetail = json.loads(newsdetail)

            srcurl = []
            for item in newsdetail["message"]["playurl"]:
                if newsdetail["message"]["playurl"][item]!="" and \
                   newsdetail["message"]["playurl"][item]!=0  and \
                   newsdetail["message"]["playurl"][item]!=[]  :
                    srcurl.append(newsdetail["message"]["playurl"][item])
            rd.setSourceUrl(srcurl)
            if len(srcurl) == 0:
                continue
            print(rd._clip_title +
                  " is successfully crawled, sending to MQ...")
            sendToMQ(rd)
            print("successfully sent to MQ !")
            domaindb.execute("insert into sohunews(pid) values('" +
                             newsitem["link"] + "')")
            domaindb.commit()
            lasttime = rd._publish_time
            #print type(rd._view_count)
        pagenum += 1
    domaindb.close()
Code example #10
def getAlbumList(channelid):
    pagenum = 1
    lasttime = CurrentTime

    print("Crawling Channels ... channelid is " + channelid)

    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')

    domaindb = sqlite3.connect("news.db")
    cursor = domaindb.cursor()
    cursor.execute(
        "create table if not exists sohunews (id integer primary key,pid text)"
    )
    #get requestdata
    while (cmp(lasttime, TenDaysAgoTime) == 1):

        #print pagenum
        listurl = AlbumUrl(ChannelIds[channelid], pagenum)

        time.sleep(1)
        newsLists = json.loads(getList(listurl))

        if not newsLists.has_key("news") or len(newsLists["news"]) == 0:
            break
        for newsitem in newsLists["news"]:
            if not newsitem.has_key("gid"):
                continue
            cursor.execute("select * from sohunews where pid='" +
                           str(newsitem["gid"]) + "'")
            if len(cursor.fetchall()) > 0:
                #print("Newsitem has been crawled before, pass...")
                continue
            clipurl = Albumdetail(str(newsitem["gid"]))
            newsdetail = getList(clipurl)
            if newsdetail is None:
                continue
            rd.setClipUrl(clipurl)
            rd.setClipTitle(newsitem["title"])
            rd.setPublishTime(DateFormat(int(newsitem["time"]) / 1000))
            rd.setViewCount(int(newsitem["commentNum"]))
            rd.setCategory('image')

            newsdetail = xmltodict.parse(newsdetail)

            srcurl = []
            new_srcurl = []
            if newsdetail["root"].has_key("gallery"):
                if type(newsdetail["root"]["gallery"]
                        ["photo"]) is types.ListType:
                    for img in newsdetail["root"]["gallery"]["photo"]:

                        srcurl.append(img["pic"])
                else:
                    srcurl.append(
                        newsdetail["root"]["gallery"]["photo"]["pic"])

            if len(srcurl) == 0:
                continue

            #FIX https://seals.vobile.cn/trac/ProjectManagement/ticket/743
            for url in srcurl:
                if url.find(',http') > 0:
                    new_srcurl.append(url[:url.find(',http')])
                else:
                    new_srcurl.append(url)

            rd.setSourceUrl(new_srcurl)
            print(rd._clip_title +
                  " is successfully crawled, sending to MQ...")
            sendToMQ(rd)
            print("successfully sent to MQ !")
            domaindb.execute("insert into sohunews(pid) values('" +
                             str(newsitem["gid"]) + "')")
            domaindb.commit()
            lasttime = rd._publish_time

        pagenum += 1
    domaindb.close()
Code example #11
def load_wechat_page(url):
    # Initialize the return values up front so the final "return success, news"
    # cannot raise a NameError when an exception is hit before they are set.
    success = False
    news = []
    try:
        driver = get_wechat_phantomjs_driver()
        driver.get(url)
        page_source = driver.page_source

        soup = BeautifulSoup(page_source, "lxml")
        logo_url = soup.find('a', attrs={'uigs': "account_name_0"})
        if not logo_url:
            raise Exception("WeChat User Not Found")
        else:
            i = logo_url['href']
        print i
        driver1 = get_wechat_phantomjs_driver()
        driver1.get(i)
        time.sleep(5)
        page_source = driver1.page_source
        soup1 = BeautifulSoup(page_source, "lxml")
        jsonstr = ''
        for i in soup1.find_all('script', attrs={'type': 'text/javascript'}):
            if i.string and 'document.domain' in i.string:
                pattern = re.compile(r"var msgList = ([^']+)]};")
                for j in pattern.findall(i.string):
                    jsonstr = j + ']}'
                    break
        news = []
        while jsonstr == '':
            #print( soup1
            print("need input auth code!")
            driver1.get_screenshot_as_file('./vcode1.jpg')
            cut_image()
            elem_code = driver1.find_element_by_id('input')
            elem_code.send_keys(get_vcode())

            driver1.find_element_by_id('bt').click()
            for _ in xrange(40):
                time.sleep(0.5)
            driver1.get_screenshot_as_file('./vcode2.jpg')
            # Re-parse the page after submitting the code; otherwise soup1
            # still holds the stale, pre-submit source and the loop never ends.
            soup1 = BeautifulSoup(driver1.page_source, "lxml")
            for i in soup1.find_all('script',
                                    attrs={'type': 'text/javascript'}):
                if i.string and 'document.domain' in i.string:
                    pattern = re.compile(r"var msgList = ([^']+)]};")
                    for j in pattern.findall(i.string):
                        jsonstr = j + ']}'
                        break
        #print( driver1.page_source
        with open('./wechatjson', 'w+') as f:
            f.write(jsonstr)
        for i in json.loads(jsonstr)['list']:
            news.append(i["app_msg_ext_info"])
            for j in i["app_msg_ext_info"]["multi_app_msg_item_list"]:
                news.append(j)
        success = True
        img_news_list = []
        video_news_list = []
        for news_item in news:
            rd1 = RequestData()
            rd1.setSouceType('wechat')
            rd1.setMetaID('')
            rd = rd1.copyrd()
            rd, rdv, isSuccess = gen_wechat_news(news_item, rd)
            if not isSuccess:
                continue
            print json.dumps(rd)
            print json.dumps(rdv)
            img_news_list.append(rd)
            video_news_list.append(rdv)
        with open('wechat.out', 'w+') as f:
            json.dump(
                {
                    "status": 0,
                    "data": {
                        "img_news": img_news_list,
                        "video_news": video_news_list
                    }
                }, f)
    except:
        time.sleep(20)
        with open('wechat.out', 'w+') as f:
            json.dump({
                "status": -1,
                "data": {
                    "error": traceback.format_exc()
                }
            }, f)
    finally:
        try:
            driver.service.process.send_signal(signal.SIGTERM)
            driver.quit()
            driver1.service.process.send_signal(signal.SIGTERM)
            driver1.quit()
        except:
            pass
    return success, news
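Taken together, the examples above share one loop: initialize a RequestData with the track source ID, source type, and meta ID; page through a channel until the newest publish time falls behind TenDaysAgoTime; build each item with setRdInfo; and hand anything that has a source URL or content to sendToMQ. A condensed sketch of that shared pattern, reusing helper names from the examples (an illustration, not a verbatim excerpt from any one file):

def crawl_channel_sketch(channel):
    # Condensed form of the loop used by the getNewsList examples above.
    lasttime = CurrentTime
    pagenum = 1
    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    while cmp(lasttime, TenDaysAgoTime) == 1:   # Python 2, as in the originals
        newsList = id2Url(channel, pagenum)     # assumed to return a list of items
        if not newsList:
            break
        for newsitem in newsList:
            rd, isSuccess = setRdInfo(newsitem, rd)
            if not isSuccess:
                continue
            if rd._source_url or rd._content:
                sendToMQ(rd)
            lasttime = rd._publish_time
        pagenum += 1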