Example #1
0
def main():
    """Entry point: run the Tudou video search once, logging any failure."""
    try:
        search_for_tudou_video_posts()
    # `except Exception as e` replaces the Py2-only `except Exception, e`
    # form; valid from Python 2.6 and required in Python 3.
    except Exception as e:
        # Record the failure against the Tudou source, then log the traceback.
        store_error(TUDOU_INFO_SOURCE_ID)
        video_logger.exception(e)
Example #2
0
def search_for_tudou_video_posts():
    """Scrape soku.com (Tudou search) for videos matching each keyword.

    For every keyword in KEYWORDS, walks up to 10 date-sorted result pages,
    extracts uploader / publish time / title / play count from each result
    card, re-filters by keyword, and stores matches via
    store_by_tudou_video_url. Per-post failures are recorded against
    TUDOU_INFO_SOURCE_ID and logged without aborting the scan.
    """
    # Count of Tudou posts already stored.
    # NOTE(review): appears unused in this function — presumably consumed
    # elsewhere (job bookkeeping); confirm before removing.
    previous_real_count = session.query(VideoPost).filter(
        VideoPost.info_source_id == TUDOU_INFO_SOURCE_ID).count()

    count = 0  # running total of result cards seen on all pages
    sql_job = Job()
    sql_job.previous_executed = datetime.now()
    sql_job.info_source_id = TUDOU_INFO_SOURCE_ID

    # Unescapes HTML entities (&amp; etc.) embedded in video titles.
    html_parser = HTMLParser.HTMLParser()

    for keyword in KEYWORDS:
        page = 1
        finished = False
        while not finished and page <= 10:
            # Date-sorted search URL for this keyword/page.
            url = "http://www.soku.com/t/nisearch/" + urllib.quote_plus(
                keyword.str.encode('utf8')
            ) + '/_cid__sort_date_display_album_time_0_page_' + str(
                page) + '?sfilter=1'
            page += 1
            headers = {
                'Host':
                'www.soku.com',
                'Referer':
                'http://www.soku.com/search_video/',
                'User-Agent':
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
            }

            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            content = response.read()

            soup = BeautifulSoup(content)

            # Each result card is a <div class="v">; an empty page means the
            # keyword's results are exhausted.
            posts = soup.findAll('div', attrs={'class': "v"})
            count += len(posts)
            if not posts:
                finished = True
                break

            for post in posts:
                try:
                    video_user_screen_name = post.find('span',
                                                       attrs={
                                                           'class': "username"
                                                       }).text
                    deltatime = post.find('span', attrs={'class': "pub"}).text
                    v_meta_title = post.find('div',
                                             attrs={'class': "v-meta-title"})
                    title = v_meta_title.a['title']
                    title = html_parser.unescape(title)
                    url = v_meta_title.a['href']

                    # Play count is the second v-meta-data entry, formatted
                    # like "label: 1,234"; default to 0 when missing/malformed.
                    try:
                        v_meta_entry = post.find(
                            'div', attrs={'class': "v-meta-entry"})
                        v_meta_datas = v_meta_entry.findAll(
                            'div', attrs={'class': "v-meta-data"})
                        playcount = v_meta_datas[1].text
                        playcount = playcount[playcount.find(":") + 1:]
                        playcount = playcount.replace(',', '')
                        playcount = int(playcount)
                    except Exception:  # was a bare except; keep best-effort
                        playcount = 0

                    # Re-filter the title against the keyword.
                    if not recheck_title(keyword, title):
                        continue

                    try:
                        created_at = convertTime(deltatime)
                        if created_at == -1:
                            # convertTime uses -1 to signal an unusable time.
                            continue
                    except Exception:  # was a bare except; fall back to "now"
                        created_at = datetime.now()

                    store_by_tudou_video_url(url, keyword.id, title,
                                             video_user_screen_name,
                                             created_at, playcount)
                    time.sleep(5)  # throttle between stored posts

                except Exception as e:
                    # Log post-level failures and continue with the next card.
                    store_error(TUDOU_INFO_SOURCE_ID)
                    video_logger.exception(e)
                    time.sleep(5)
Example #3
0
def main():
    """Entry point: run the Youku video search once, logging any failure."""
    try:
        search_for_youku_video_posts()
    # `except Exception as e` replaces the Py2-only `except Exception, e`
    # form; valid from Python 2.6 and required in Python 3.
    except Exception as e:
        # Record the failure against the Youku source, then log the traceback.
        store_error(YOUKU_INFO_SOURCE_ID)
        video_logger.exception(e)
Example #4
0
def main():
    """Entry point: run the Sina video search once, logging any failure."""
    try:
        search_for_sina_video_posts()
    # `except Exception as e` replaces the Py2-only `except Exception, e`
    # form; valid from Python 2.6 and required in Python 3.
    except Exception as e:
        # Record the failure against the Sina video source, then log it.
        store_error(SINA_VIDEO_INFO_SOURCE_ID)
        video_logger.exception(e)
Example #5
0
def search_for_sina_video_posts():
    """Scrape video.sina.com.cn search results for each keyword.

    Walks up to 10 result pages per keyword. Result <tr> rows come in pairs:
    even rows carry each video's url/title, the following odd row carries the
    uploader, publish time and play count; the two are joined positionally
    through temp_arr. Matches pass a second keyword/time filter before being
    stored via store_by_sina_video_url. Row-level failures are recorded
    against SINA_VIDEO_INFO_SOURCE_ID and logged without aborting the scan.
    """
    # Count of Sina posts already stored.
    # NOTE(review): appears unused in this function — presumably consumed
    # elsewhere (job bookkeeping); confirm before removing.
    previous_real_count = session.query(VideoPost).filter(
        VideoPost.info_source_id == SINA_VIDEO_INFO_SOURCE_ID).count()

    count = 0  # NOTE(review): never updated below; kept for interface parity
    sql_job = Job()
    sql_job.previous_executed = datetime.now()
    sql_job.info_source_id = SINA_VIDEO_INFO_SOURCE_ID

    for keyword in KEYWORDS:
        page = 1
        finished = False
        while not finished and page <= 10:
            url = "http://video.sina.com.cn/search/index.php?k=" + urllib.quote_plus(
                keyword.str.encode('utf8')) + "&m1=a&m3=a2&page=" + str(page)
            page += 1

            headers = {
                'Host':
                'video.sina.com.cn',
                'Referer':
                'http://video.sina.com.cn/search/index.php?',
                'User-Agent':
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
            }

            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            content = response.read()

            soup = BeautifulSoup(content)
            # `is None` instead of `== None` (identity check, PEP 8).
            video_list = soup.find('div', id="contentH")
            if video_list is None:
                finished = True
                break
            divs = video_list.findAll('div')
            if not divs:
                finished = True
                break
            tr_arr = video_list.findAll('tr')
            temp_arr = []

            for i, tr in enumerate(tr_arr):
                try:
                    if i % 2 == 0:
                        # Even row: collect (url, title) for each video cell.
                        temp_arr = []
                        td_divs = tr.findAll('div', attrs={'class': "v_Info"})
                        for j, td_div in enumerate(td_divs):
                            name_div = td_div.find('div',
                                                   attrs={'class': 'name'})
                            a_tag = name_div.findAll('a')[1]
                            video_url = a_tag['href']
                            video_title = a_tag['title']
                            temp_arr.append({
                                'video_url': video_url,
                                'video_title': video_title
                            })
                    else:
                        # Odd row: uploader / publish time / play count,
                        # matched by cell index to the previous even row.
                        td_divs = tr.findAll('div', attrs={'class': "v_Info"})
                        for j, td_div in enumerate(td_divs):
                            li_arr = td_div.findAll('li')
                            try:
                                video_user = li_arr[0].a['title']
                            except Exception:  # was bare; anchor-text fallback
                                video_user = li_arr[0].a.text
                            video_createAt = li_arr[1].text
                            created_at = convertTime(video_createAt)

                            video_url = temp_arr[j]['video_url']
                            video_title = temp_arr[j]['video_title']

                            # Play count text starts with a 3-char label
                            # prefix; default to 0 when missing/malformed.
                            try:
                                play_count = li_arr[2].text
                                play_count = play_count[3:]
                                play_count = play_count.replace(',', '')
                                play_count = int(play_count)
                            except Exception:  # was bare; keep best-effort
                                play_count = 0

                            # Second-pass filter on keyword match and a valid
                            # time (-1 from convertTime marks unusable input);
                            # truthiness test matches the Tudou sibling.
                            if created_at != -1 and recheck_title(
                                    keyword, video_title):
                                store_by_sina_video_url(
                                    video_url, keyword.id, video_title,
                                    video_user, created_at, play_count)

                except Exception as e:
                    # Log row-level failures and continue with the next row.
                    store_error(SINA_VIDEO_INFO_SOURCE_ID)
                    video_logger.exception(e)
                    time.sleep(5)

            time.sleep(5)  # throttle between page fetches
def main():
    """Entry point: run the global Youku video search, logging any failure."""
    try:
        search_for_youku_global_video_posts()
    # `except Exception as e` replaces the Py2-only `except Exception, e`
    # form; valid from Python 2.6 and required in Python 3.
    except Exception as e:
        # Record the failure against the all-video source, then log it.
        store_error(ALL_VIDEO_INFO_SOURCE_ID)
        video_logger.exception(e)