Beispiel #1
0
 def get_auto_configured_spider(cls, offset=0):
     cats = dict()
     cats['105'] = '海外綜合'
     cats['102'] = '新聞專輯'
     singtao_seed = {'http://std.stheadline.com/'}
     util.add_hrefs('http://std.stheadline.com/',
                    selectors={'#navbar ul.nav li.has-children > a'},
                    seeds=singtao_seed)
     '''
     id_pattern = re.compile(ur'(?<=php\?cat=)\d+')
     for cat in d('ul.sub-menu a').items():
         if re.findall(id_pattern, cat.attr('href')):
             cats[re.findall(id_pattern, cat.attr('href'))[0]] = cat.text()
         if instant_cat_pattern.match(cat.attr('href')):
             singtao_seed.add(cat.attr('href'))
     for k, v in cats.iteritems():
         singtao_seed.add('http://std.stheadline.com/daily/section-list.php?cat=' + k)
     '''
     spider_singtao = SpiderSingTao(
         'SpiderSingTao',
         singtao_seed, {
             ur'(http://std\.stheadline\.com/daily/news-content\.php.*)|(http://std\.stheadline\.com/instant/articles/detail/\d+.*)'
         },
         THREAD_NUM=1,
         cats=cats)
     spider_singtao.FETCH_DELAY = 0.5
     spider_singtao.BATCH_NUMBER = util.get_day_stamp() + 10060
     spider_singtao.OFFSET = offset
     return spider_singtao
Beispiel #2
0
 def get_auto_configured_spider(cls, offset=0):
     """Build a ``SpiderHKCD`` whose URL filter targets a single day.

     Seeds come from two hkcd.com entry pages plus the links that
     ``util.add_hrefs`` is asked to collect. Two category dictionaries are
     computed up front and attached to the spider.

     :param offset: forwarded to ``util.get_day_string`` to pick the day
         embedded in the content URL pattern.
     :return: the configured ``SpiderHKCD`` instance.
     """
     seeds = {'http://hk.hkcd.com/node_25195.htm'}
     util.add_hrefs(url='http://hk.hkcd.com/node_30602.htm',
                    selectors={'a'},
                    seeds=seeds,
                    seed_patterns={node_pattern},
                    prefix='http://hk.hkcd.com/')
     seeds.add('http://www.hkcd.com/')

     hk_categories = _get_hk_cat_dict(seeds, {content_pattern})
     categories = _get_cat_dict(
         {'http://www.hkcd.com/content/2016-07/18/content_1008717.html'})

     # Re-slice the day string into "<0:4>-<4:6>/<6:8>"
     # (assumes an 8-character YYYYMMDD value -- TODO confirm).
     ymd = util.get_day_string(offset=offset)
     day_path = '%s-%s/%s' % (ymd[0:4], ymd[4:6], ymd[6:8])
     url_patterns = {
         r'http://(www|hk)\.hkcd\.com/content/' + day_path + '/.*'
     }

     spider = SpiderHKCD('SpiderHKCD',
                         seeds,
                         url_patterns,
                         THREAD_NUM=10,
                         MAX_DEPTH=2)
     spider._hk_cat_dict = hk_categories
     spider._cat_dict = categories
     spider.BATCH_NUMBER = util.get_day_stamp() + 10120
     return spider
Beispiel #3
0
 def get_auto_configured_spider(cls, offset=0):
     dmhk_seed = {'http://news.dmhk.net/'}
     util.add_hrefs('http://news.dmhk.net/', {'#mega_main_menu_ul a'}, dmhk_seed)
     dmhk_reg = {ur'http://news\.dmhk\.net/?p=\d+'}
     spider_dmhk = SpiderDMHK('SpiderDMHK', dmhk_seed, dmhk_reg, THREAD_NUM=5)
     spider_dmhk.BATCH_NUMBER = util.get_day_stamp() + 10230
     return spider_dmhk
Beispiel #4
0
 def get_auto_configured_spider(cls, offset=0):
     _852_seed = {'http://www.post852.com/'}
     util.add_hrefs('http://www.post852.com/', {'#rightnav a'}, _852_seed)
     spider_852 = Spider852('Spider852',
                            _852_seed, {ur'http://www.post852.com/\d+/.+'},
                            THREAD_NUM=5)
     spider_852.OFFSET = offset
     spider_852.BATCH_NUMBER = util.get_day_stamp() + 10400
     return spider_852
Beispiel #5
0
 def get_auto_configured_spider(cls, offset=0):
     savantas_seed = {'http://www.savantas.org/'}
     util.add_hrefs('http://www.savantas.org/', {'#navigation a'},
                    savantas_seed)
     spider_savantas = SpiderSavantas('SpiderSavantas',
                                      savantas_seed,
                                      {ur'http://www.savantas.org/\?p=\d+'},
                                      THREAD_NUM=5)
     spider_savantas.OFFSET = offset
     spider_savantas.BATCH_NUMBER = util.get_day_stamp() + 10460
     return spider_savantas
Beispiel #6
0
 def get_auto_configured_spider(cls, offset=0):
     bbc_seed = {'http://www.hkcna.hk/'}
     util.add_hrefs('http://www.hkcna.hk/', {'.baner_01 a'}, bbc_seed, news_prefix)
     day_str = util.get_day_string(offset=offset)
     day_str = day_str[:4] + '/' + day_str[4:]
     spider_hkcna = SpiderHKCNA('SpiderHKCNA',
                                bbc_seed,
                                {ur'http://www.hkcna.hk/.+' + day_str + '.+'},
                                THREAD_NUM=5)
     spider_hkcna.OFFSET = offset
     spider_hkcna.BATCH_NUMBER = util.get_day_stamp() + 10370
     return spider_hkcna
Beispiel #7
0
 def get_auto_configured_spider(cls, offset=0):
     now_seed = {'https://news.now.com/home'}
     util.add_hrefs('https://news.now.com/home', {'#navBar a'},
                    now_seed,
                    seed_patterns={re.compile(r'/home/.+')},
                    prefix=prefix)
     spider_now = SpiderNow('SpiderNow',
                            now_seed,
                            {ur'https://news\.now\.com/.+newsId=\d+.+'},
                            THREAD_NUM=10)
     spider_now.BATCH_NUMBER = util.get_day_stamp() + 10280
     spider_now.OFFSET = offset
     return spider_now
Beispiel #8
0
 def get_auto_configured_spider(cls, offset=0):
     finet_seed = {'http://www2.finet.hk/'}
     util.add_hrefs(url='http://www2.finet.hk/',
                    selectors={'#mainmenu2 li a'},
                    seeds=finet_seed)
     finet_reg = {ur'http://www2\.finet\.hk/Newscenter/news_detail/.+'}
     spider_finet = SpiderFinet('SpiderFinet',
                                finet_seed,
                                finet_reg,
                                THREAD_NUM=10)
     spider_finet.BATCH_NUMBER = util.get_day_stamp() + 10250
     spider_finet.OFFSET = offset
     return spider_finet
Beispiel #9
0
 def get_auto_configured_spider(cls, offset=0):
     cablenews_seed = {
         'http://cablenews.i-cable.com/webapps/index/index.php'
     }
     util.add_hrefs('http://cablenews.i-cable.com/webapps/index/index.php',
                    {'#header_web_chi a'}, cablenews_seed)
     cablenews_reg = {ur'http://.+?\.i-cable\.com/.*videopage.*\d+/.*'}
     spider_cablenews = SpiderCableNews('SpiderCableNews',
                                        cablenews_seed,
                                        cablenews_reg,
                                        THREAD_NUM=10)
     spider_cablenews.BATCH_NUMBER = util.get_day_stamp() + 10220
     spider_cablenews.OFFSET = offset
     return spider_cablenews
Beispiel #10
0
 def get_auto_configured_spider(cls, offset=0):
     bbc_seed = set()
     util.add_hrefs('http://www.bbc.com/zhongwen/trad/',
                    {'ul.navigation-wide-list a'}, bbc_seed, news_prefix)
     bbc_seed.add('http://www.bbc.com/zhongwen/trad/hong_kong_review')
     day_str = util.get_day_string(offset=offset)
     day_str = day_str[2:]
     spider_bbc = SpiderBBC('SpiderBBC',
                            bbc_seed,
                            {ur'http://www.bbc.com/.+' + day_str + '.+'},
                            THREAD_NUM=5)
     spider_bbc.OFFSET = offset
     spider_bbc.BATCH_NUMBER = util.get_day_stamp() + 10360
     return spider_bbc
Beispiel #11
0
 def get_auto_configured_spider(cls, offset=0):
     tvb_seed = {'http://news.tvb.com/'}
     util.add_hrefs(
         url='http://news.tvb.com/',
         selectors={'#topMenu a'},
         seeds=tvb_seed,
         seed_patterns={re.compile(r'http://news\.tvb\.com/list/\d+/')})
     spider_tvb = SpiderTVB('SpiderTVB',
                            tvb_seed,
                            {ur'http://news\.tvb\.com/\w+/[\d\w]{10,}'},
                            THREAD_NUM=10,
                            MAX_DEPTH=2)
     spider_tvb.BATCH_NUMBER = util.get_day_stamp() + 10290
     spider_tvb.OFFSET = offset
     return spider_tvb
Beispiel #12
0
 def get_auto_configured_spider(cls, offset=0):
     vj_seed = {'http://www.vjmedia.com.hk/'}
     util.add_hrefs(url='http://www.vjmedia.com.hk/',
                    selectors={'ul.mainnav.dropdown li a'},
                    seeds=vj_seed)
     vj_reg = {
         ur'http://www.vjmedia.com.hk/articles/' +
         util.get_day_string(interval_str='/', offset=offset) + '/.+'
     }
     spider_vj = SpiderVJMedia('SpiderVJMedia',
                               vj_seed,
                               vj_reg,
                               THREAD_NUM=10)
     spider_vj.OFFSET = offset
     spider_vj.BATCH_NUMBER = util.get_day_stamp() + 10180
     return spider_vj
Beispiel #13
0
 def get_auto_configured_spider(cls, offset=0):
     _wsj_seed = {'http://cn.wsj.com/gb/globe.asp'}
     util.add_hrefs('http://cn.wsj.com/gb/globe.asp',
                    {'#navigation a[target=_top]'}, _wsj_seed)
     wsj_seed = set()
     for url in _wsj_seed:
         if incomplete_cat_url_pattern.match(url):
             url = news_prefix + url[2:]
             wsj_seed.add(url)
     day_str = util.get_day_string(offset=offset)
     spider_wsj = SpiderWSJ('SpiderWSJ',
                            wsj_seed,
                            {ur'http://cn.wsj.com/gb.+' + day_str + '.+'},
                            THREAD_NUM=5)
     spider_wsj.OFFSET = offset
     spider_wsj.BATCH_NUMBER = util.get_day_stamp() + 10380
     return spider_wsj
Beispiel #14
0
 def get_auto_configured_spider(cls, offset=0):
     metrofinance_seed = {
         'http://www.metroradio.com.hk/104/',
         'http://www.metroradio.com.hk/news/live.aspx'
     }
     util.add_hrefs('http://www.metroradio.com.hk/104/', {'.toplink2 a'},
                    metrofinance_seed,
                    prefix=prefix)
     day_str = util.get_day_string(offset=offset)
     spider_metrofinance = SpiderMetroFinance(
         'SpiderMetroFinance',
         metrofinance_seed,
         {ur'http://www\.metroradio\.com\.hk/.+' + day_str + '.+'},
         THREAD_NUM=10)
     spider_metrofinance.OFFSET = offset
     spider_metrofinance.BATCH_NUMBER = util.get_day_stamp() + 10410
     return spider_metrofinance
Beispiel #15
0
 def get_auto_configured_spider(cls, offset=0):
     xinhua_seed = {
         'http://www.news.cn/gangao/index.htm',
         'http://www.news.cn/gangao/jsxw.htm'
     }
     util.add_hrefs('http://www.news.cn/gangao/index.htm', {'.nav.domPC a'},
                    xinhua_seed, news_prefix)
     day_str = util.get_day_string('-', offset=offset)
     day_str = day_str[:-3] + '/' + day_str[-2:]
     spider_xinhua = SpiderXinhua(
         'SpiderXinhua',
         xinhua_seed,
         {ur'http://news\.xinhuanet\.com/gangao/' + day_str + '.+'},
         THREAD_NUM=5)
     spider_xinhua.OFFSET = offset
     spider_xinhua.BATCH_NUMBER = util.get_day_stamp() + 10430
     return spider_xinhua
Beispiel #16
0
 def get_auto_configured_spider(cls, offset=0):
     initium_seed = {'https://theinitium.com/'}
     util.add_hrefs(url='https://theinitium.com/',
                    selectors={'div.left-nav-top li a'},
                    seeds=initium_seed,
                    prefix=prefix)
     initium_reg = {
         ur'https://theinitium\.com/article/' +
         util.get_day_string(offset=offset) + '-.+',
         ur'http://feeds\.initium.+'
     }
     spider_initium = SpiderInitium('SpiderInitium',
                                    initium_seed,
                                    initium_reg,
                                    THREAD_NUM=10)
     spider_initium.BATCH_NUMBER = util.get_day_stamp() + 10190
     spider_initium.OFFSET = offset
     return spider_initium
 def get_auto_configured_spider(cls, offset=1):
     day_str = util.get_day_string('.', 'inverse', offset=offset)
     commercial_seed = {
         'http://www.881903.com/Page/ZH-TW/news.aspx?sdate=' + day_str +
         '&csid=261_341'
     }
     util.add_hrefs(url='http://www.881903.com/Page/ZH-TW/news.aspx?' +
                    day_str + '&csid=261_341',
                    seeds=commercial_seed,
                    selectors={'#newsCategoryTab a'},
                    prefix=cat_prefix)
     _seed = copy.deepcopy(commercial_seed)
     for seed in _seed:
         if cat_page_pattern.match(seed):
             r = util.get_safe_response(seed)
             if r:
                 d = pq(r.text)
                 if re.findall(total_page_pattern,
                               d('.Font_Article_CH').text()):
                     total_page = int(
                         re.findall(total_page_pattern,
                                    d('.Font_Article_CH').text())[0])
                     for i in range(total_page):
                         commercial_seed.add(seed + '&page=' + str(i + 1))
     ''''
     r = requests.get('http://www.881903.com/Page/ZH-TW/index.aspx')
     d = pq(r.text)
     for a in d('.header2012 ul li a').items():
         if a.attr('href'):
             u = a.attr('href')
             if not complete_pattern.match(u):
                 if incomplete_pattern.match(u):
                     u = prefix + u
                     commercial_seed.add(u)
     '''
     commercial_reg = {ur'http://www\.881903\.com/.+detail.*'}
     spider_commercial = SpiderCommercialRadio('SpiderCommercialRadio',
                                               commercial_seed,
                                               commercial_reg,
                                               THREAD_NUM=10)
     spider_commercial.BATCH_NUMBER = util.get_day_stamp() + 10260
     spider_commercial.OFFSET = offset
     # spider_commercial.MAX_DEPTH = 5
     return spider_commercial