Example #1
def main_task():
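    # Crawl the Ming Pao print-edition (PNS) archive day by day, newest first,
    # resolving each day's issue id from issuelist.js before seeding the spider.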

    issue_dict_url = 'http://news.mingpao.com/dat/pns/issuelist.js?819181'
    r = requests.get(issue_dict_url)
    json_issue_dict = json.loads(r.text)

    src.spiders.spider_mingpao.SpiderMingPao.PUT_IN_STORAGE = True
    for i in range(util.get_offset_by_day_date('20040627'),
                   util.get_offset_by_day_date('20010101')):
        day_str = util.get_day_string(offset=i)
        if '1 ' + day_str in json_issue_dict['PNS_WEB_TC']:
            issue_id = json_issue_dict['PNS_WEB_TC']['1 ' + day_str]['E']
            news_list_url = 'http://news.mingpao.com/dat/pns/pns_web_tc/feed1/' + day_str + issue_id + '/content.js'
            mingpao_seed = set()
            r = requests.get(news_list_url)
            if 'feed_module_2' in r.text:
                news_list_data = news_list_data_pattern.findall(r.text)[0]
                json_obj = json.loads(news_list_data)
                for it in json_obj['rss']['channel']['item']:
                    mingpao_seed.add(
                        'http://news.mingpao.com/dat/pns/pns_web_tc/article1/'
                        + day_str + issue_id.lower() + '/todaycontent_' +
                        str(it['ATTRIBUTES']['NODEID']) + '.js')
            mingpao_reg = {
                ur'http://news\.mingpao\.com/dat/pns/.*' + day_str + '.+'
            }
            spider_mingpao = src.spiders.spider_mingpao.SpiderMingPao(
                'SpiderMingPao', mingpao_seed, mingpao_reg, THREAD_NUM=5)
            spider_mingpao.OFFSET = i
            spider_mingpao.logger_file = spider_mingpao.get_file_logger(
                'mingpao_task_log', 'logs/mingpao_task.log')
            spider_mingpao.BATCH_NUMBER = util.get_day_stamp(i) + 10570
            spider_mingpao.start()
        else:
            print 'KEY ERROR: "1 ' + day_str + '"'
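All eight tasks lean on the same util helpers, which are defined elsewhere in the project. A minimal sketch of the assumed contract (the helper names come from the calls above, but the bodies are assumptions): offsets count whole days back from today, so an older date yields a larger offset and range(newer, older) walks backwards through the archive.

import datetime
import re
import time


def get_offset_by_day_date(day_date):
    # Days between today and a 'YYYYMMDD' date; assumed positive for past dates.
    day = datetime.datetime.strptime(day_date, '%Y%m%d').date()
    return (datetime.date.today() - day).days


def get_day_string(interval_str='', style='', offset=0):
    # 'YYYYMMDD' for the day `offset` days ago; interval_str joins the parts,
    # and style='american' is assumed to put the month first.
    day = datetime.date.today() - datetime.timedelta(days=offset)
    parts = ['%04d' % day.year, '%02d' % day.month, '%02d' % day.day]
    if style == 'american':
        parts = [parts[1], parts[2], parts[0]]
    return interval_str.join(parts)


def get_day_stamp(offset=0):
    # Midnight of the target day as a Unix timestamp; the exact unit
    # (seconds vs. milliseconds) is an assumption.
    day = datetime.date.today() - datetime.timedelta(days=offset)
    return int(time.mktime(day.timetuple()))

news_list_data_pattern, used above to peel the JavaScript wrapper off content.js before json.loads, is likewise assumed to look something like:

# Grab the outermost JSON object literal; the exact pattern is a guess.
news_list_data_pattern = re.compile(r'(\{.*\})', re.S)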
Example #2
def main_task():
    govinfo.SpiderGovInfoNews.PUT_IN_STORAGE = True
    for i in range(util.get_offset_by_day_date('20160929'), util.get_offset_by_day_date('19980331')):
        day_str = util.get_day_string(offset=i)
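        # Rewrite YYYYMMDD as YYYYMM/DD to match the info.gov.hk archive path.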
        day_str = day_str[:-2] + '/' + day_str[-2:]
        govinfo_seed = {'http://www.info.gov.hk/gia/general/' + day_str + 'c.htm'}
        govinfo_reg = {ur'http://www\.info\.gov\.hk/gia/general/' + day_str + '.+'}
        spider_govinfo = govinfo.SpiderGovInfoNews('SpiderGovInfoNews', govinfo_seed, govinfo_reg, THREAD_NUM=10)
        spider_govinfo.OFFSET = i
        spider_govinfo.logger_file = spider_govinfo.get_file_logger('govinfo_task_log', 'logs/govinfo_task.log')
        spider_govinfo.BATCH_NUMBER = util.get_day_stamp(i) + 10600
        spider_govinfo.start()
Example #3
def main_task():
    now_news.SpiderNow.PUT_IN_STORAGE = True
    for i in range(util.get_offset_by_day_date('20151111'), util.get_offset_by_day_date('20110430')):
        day_str = util.get_day_string(interval_str='-', offset=i)
        now_seed = {'https://news.now.com/home/past?date=' + day_str}
        now_reg = {ur'https://news\.now\.com/.+newsId=\d+.+'}
        spider_now = now_news.SpiderNow('SpiderNow',
                                        now_seed,
                                        now_reg,
                                        THREAD_NUM=10)
        spider_now.BATCH_NUMBER = util.get_day_stamp(offset=i) + 10280
        spider_now.OFFSET = i
        spider_now.logger_file = spider_now.get_file_logger('nownews_task_log', 'logs/now_task.log')
        spider_now.start()
Example #4
def main_task():
    commercialradio.SpiderCommercialRadio.PUT_IN_STORAGE = True
    commercialradio.SpiderCommercialRadio.CRAWL_NEXT = False
    commercial_reg = {ur'http://www\.881903\.com/.+detail.*'}
    reg_pattern = re.compile(ur'http://www\.881903\.com/.+detail.*')
    for i in range(util.get_offset_by_day_date('20110605'),
                   util.get_offset_by_day_date('20080101')):
        day_str = util.get_day_string(interval_str='-',
                                      style='american',
                                      offset=i)
        portal_url = 'http://www.881903.com/Page/ZH-TW/newssearch.aspx?sdate=' + day_str + '&edate=' + day_str + '&csid=261_0'
        commercial_seed = set()
        # Create the driver outside the try block so the finally clause
        # cannot reference an unbound ie_driver if construction fails.
        ie_driver = webdriver.Ie(
            'C://Users/benwu/Desktop/IEDriverServer.exe')
        try:
            ie_driver.get(portal_url)
            d = pq(ie_driver.page_source)
            add_hrefs(commercial_seed, reg_pattern, ie_driver.page_source)
            if total_page_pattern.findall(d('td.Font_Article_CH').text()):
                total_page = int(
                    total_page_pattern.findall(
                        d('td.Font_Article_CH').text())[0])
                for j in range(2, total_page + 1):
                    # print 'page: ' + str(j)
                    ie_driver.execute_script('StockSearchCallBack(' + str(j) +
                                             ');')
                    # Poll until the pager shows the requested page number,
                    # i.e. the in-page AJAX call has finished rendering.
                    load_done = False
                    while not load_done:
                        dd = pq(ie_driver.page_source)
                        if dd('.Font_Article_CH span'):
                            num = page_num_pattern.findall(
                                dd('.Font_Article_CH span').text())[0]
                            if num == str(j):
                                load_done = True
                    add_hrefs(commercial_seed, reg_pattern,
                              ie_driver.page_source)
                    # print len(commercial_seed)
        finally:
            ie_driver.close()
        spider_commercial = commercialradio.SpiderCommercialRadio(
            'SpiderCommercialRadio',
            commercial_seed,
            commercial_reg,
            THREAD_NUM=10)
        spider_commercial.BATCH_NUMBER = util.get_day_stamp(offset=i) + 10260
        spider_commercial.OFFSET = i
        spider_commercial.logger_file = spider_commercial.get_file_logger(
            'commercial_task_log', 'logs/commercial_task.log')
        spider_commercial.start()
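add_hrefs, total_page_pattern and page_num_pattern are module-level helpers not shown above. A plausible sketch, assuming add_hrefs harvests every matching anchor from the rendered page source and the two patterns read page numbers out of the pager text (the exact regexes are guesses):

import re

from pyquery import PyQuery as pq

# Assumed pager text along the lines of '1/12'.
total_page_pattern = re.compile(r'/\s*(\d+)')
page_num_pattern = re.compile(r'(\d+)')


def add_hrefs(seed, reg_pattern, page_source):
    # Collect every anchor href in the page that matches the regex.
    d = pq(page_source)
    for a in d('a').items():
        href = a.attr('href')
        if href and reg_pattern.match(href):
            seed.add(href)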
Example #5
def main_task():
    rthk.SpiderRTHK.PUT_IN_STORAGE = True
    for i in range(util.get_offset_by_day_date('20160717'), util.get_offset_by_day_date('20150927')):
        current_day_string = util.get_day_string(offset=i)
        day_string = ('archive_year=' + current_day_string[0:4] +
                      '&archive_month=' + current_day_string[4:6] +
                      '&archive_day=' + current_day_string[6:8])
        instant_news_page_url = 'http://news.rthk.hk/rthk/ch/news-archive.htm?' + day_string + '&archive_cat=all'
        rthk_seed = {instant_news_page_url}
        rthk_reg = {ur'http://news\.rthk\.hk/rthk/ch/component/.*' +
                    current_day_string + '.*'}
        spider_rthk = rthk.SpiderRTHK('SpiderRTHK', rthk_seed, rthk_reg, THREAD_NUM=5)
        spider_rthk.BATCH_NUMBER = util.get_day_stamp() + 10130
        spider_rthk.OFFSET = i
        spider_rthk.logger_file = spider_rthk.get_file_logger('rthk_task_log', 'logs/rthk_task.log')
        spider_rthk.start()
Example #6
def main_task():
    hket.SpiderHKET.PUT_IN_STORAGE = True
    hket_reg = {ur'http://.+\.hket\.com/article/\d+/.*'}
    for i in range(util.get_offset_by_day_date('20161010'), util.get_offset_by_day_date('20161006')):
        day_str = util.get_day_string(offset=i)
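        # The encoded path segment decodes to the section name (昔日新聞, "past news").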
        portal_url = 'http://paper.hket.com/srap017/%E6%98%94%E6%97%A5%E6%96%B0%E8%81%9E?dis=' + day_str
        hket_seed = {portal_url}
        spider_hket = hket.SpiderHKET('SpiderHKET',
                                      hket_seed,
                                      hket_reg,
                                      THREAD_NUM=5,
                                      MAX_DEPTH=1)
        spider_hket.BATCH_NUMBER = util.get_day_stamp() + 10110
        spider_hket.OFFSET = i
        spider_hket.logger_file = spider_hket.get_file_logger('hket_task_log',
                                                              'logs/hket_task.log')
        spider_hket.start()
Example #7
def main_task():
    src.spiders.spider_apple.SpiderApple.PUT_IN_STORAGE = True
    for i in range(util.get_offset_by_day_date('20070227'),
                   util.get_offset_by_day_date('20020101')):
        day_str = util.get_day_string(offset=i)
        apple_seed = {
            'http://hk.apple.nextmedia.com/archive/index/' + day_str +
            '/index/'
        }
        spider_apple = src.spiders.spider_apple.SpiderApple(
            'SpiderApple',
            apple_seed,
            {ur'http://hk\.apple\.nextmedia\.com/.*' + day_str + '/.*'},
            THREAD_NUM=5)
        spider_apple.BATCH_NUMBER = util.get_day_stamp(offset=i) + 10590
        spider_apple.OFFSET = i
        spider_apple.logger_file = spider_apple.get_file_logger(
            'apple_task_log', 'logs/apple_task.log')
        spider_apple.start()
Example #8
def main_task():
    cable_news.SpiderCableNews.PUT_IN_STORAGE = True
    for i in range(util.get_offset_by_day_date('20160724'), util.get_offset_by_day_date('20131231')):
        day_str = util.get_day_string(offset=i)
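        # Page 1 exposes the day's total page count in the '#1' element's text.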
        first_url = get_news_page_url(day_str, 1)
        r = requests.get(first_url)
        d = pq(r.text)
        if total_page_pattern.findall(d('#1').text()):
            total_page = int(total_page_pattern.findall(d('#1').text())[0])
            cablenews_seed = set()
            for j in range(total_page):
                cablenews_seed.add(get_news_page_url(day_str, j+1))
            cablenews_reg = {ur'http://.+?\.i-cable\.com/.*videopage.*\d+/.*',
                             ur'http://.+?\.i-cable\.com/.*VideoPage.*\d+/.*'}
            spider_cablenews = cable_news.SpiderCableNews('SpiderCableNews',
                                                          cablenews_seed,
                                                          cablenews_reg,
                                                          THREAD_NUM=10)
            spider_cablenews.BATCH_NUMBER = util.get_day_stamp(offset=i) + 10220
            spider_cablenews.OFFSET = i
            spider_cablenews.logger_file = spider_cablenews.get_file_logger('cablenews_task_log', 'logs/cablenews_task.log')
            spider_cablenews.start()
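get_news_page_url lives elsewhere in the module, and total_page_pattern is assumed to read the count out of the pager text as in the Example #4 sketch. A sketch of the assumed shape of the URL builder, one archive page per call (the URL template and query parameters here are hypothetical):

def get_news_page_url(day_str, page):
    # Hypothetical i-CABLE daily-archive template; the real parameters may differ.
    return ('http://news.i-cable.com/ci/videopage/searchresult/?date=' +
            day_str + '&page=' + str(page))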