Example 1
 def get_auto_configured_spider(cls, offset=0):
     xinhua_seed = {
         'http://www.news.cn/gangao/index.htm',
         'http://www.news.cn/gangao/jsxw.htm'
     }
     util.add_hrefs('http://www.news.cn/gangao/index.htm', {'.nav.domPC a'},
                    xinhua_seed, news_prefix)
     day_str = util.get_day_string('-', offset=offset)
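     # e.g. '2017-05-31' -> '2017-05/31', matching the date segment in the regex below (format assumed)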
     day_str = day_str[:-3] + '/' + day_str[-2:]
     spider_xinhua = SpiderXinhua(
         'SpiderXinhua',
         xinhua_seed,
         {ur'http://news\.xinhuanet\.com/gangao/' + day_str + '.+'},
         THREAD_NUM=5)
     spider_xinhua.OFFSET = offset
     spider_xinhua.BATCH_NUMBER = util.get_day_stamp() + 10430
     return spider_xinhua
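
A spider built by one of these factory methods is presumably driven the way Example 12's main() drives SpiderPrices, roughly as follows (usage assumed; only start() appears in these snippets):

 spider = SpiderXinhua.get_auto_configured_spider(offset=0)  # offset 0 = today's batch
 spider.start()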
Example 2
 def get_auto_configured_spider(cls, offset=0):
     # pt_index = pq(requests.get('http://www.passiontimes.hk/4.0/index.php').text)
     # pt_seed = set([])
     # cat_pattern = re.compile(ur'/4.0/category/.*')
     # for cat in pt_index('div.footer-siteMap a').items():
     #     if cat_pattern.match(cat.attr('href').encode('utf-8')):
     #         pt_seed.add('http://www.passiontimes.hk' + str(cat.attr('href')))
     pt_seed = {
         'http://www.passiontimes.hk/4.0/index.php',
         'http://www.passiontimes.hk/4.0/category/3/19',
         'http://www.passiontimes.hk/4.0/category/1/5',
         'http://www.passiontimes.hk/4.0/category/1/4',
         'http://www.passiontimes.hk/4.0/category/1/7',
         'http://www.passiontimes.hk/4.0/category/2/62',
         'http://www.passiontimes.hk/4.0/category/1/1',
         'http://www.passiontimes.hk/4.0/category/1/3',
         'http://www.passiontimes.hk/4.0/category/1/2',
         'http://www.passiontimes.hk/4.0/category/1/6',
         'http://www.passiontimes.hk/4.0/category/2/18',
         'http://www.passiontimes.hk/4.0/category/2/14',
         'http://www.passiontimes.hk/4.0/category/2/15',
         'http://www.passiontimes.hk/4.0/category/2/16',
         'http://www.passiontimes.hk/4.0/category/2/17',
         'http://www.passiontimes.hk/4.0/category/2/10',
         'http://www.passiontimes.hk/4.0/category/2/11',
         'http://www.passiontimes.hk/4.0/category/2/12',
         'http://www.passiontimes.hk/4.0/category/2/13',
         'http://www.passiontimes.hk/4.0/category/2/8',
         'http://www.passiontimes.hk/4.0/category/2/9',
         'http://www.passiontimes.hk/4.0/category/3/125',
         'http://www.passiontimes.hk/4.0/category/1/37',
         'http://www.passiontimes.hk/4.0/category/2/135',
         'http://www.passiontimes.hk/4.0/category/1/124',
         'http://www.passiontimes.hk/4.0/category/3/22',
         'http://www.passiontimes.hk/4.0/category/3/23',
         'http://www.passiontimes.hk/4.0/category/3/20',
         'http://www.passiontimes.hk/4.0/category/3/21',
         'http://www.passiontimes.hk/4.0/category/3/24'
     }
     pt_spider = SpiderPt(
         'PtSpider', pt_seed,
         {r'http://www\.passiontimes\.hk/article/' +
          util.get_day_string('-', 'american', offset=offset) + '.*'},
         THREAD_NUM=10)
     pt_spider.OFFSET = offset
     pt_spider.BATCH_NUMBER = util.get_day_stamp() + 10001
     return pt_spider
Example 3
 def get_auto_configured_spider(cls, offset=0):
     initium_seed = {'https://theinitium.com/'}
     util.add_hrefs(url='https://theinitium.com/',
                    selectors={'div.left-nav-top li a'},
                    seeds=initium_seed,
                    prefix=prefix)
     initium_reg = {
         ur'https://theinitium\.com/article/' +
         util.get_day_string(offset=offset) + '-.+',
         ur'http://feeds\.initium.+'
     }
     spider_initium = SpiderInitium('SpiderInitium',
                                    initium_seed,
                                    initium_reg,
                                    THREAD_NUM=10)
     spider_initium.BATCH_NUMBER = util.get_day_stamp() + 10190
     spider_initium.OFFSET = offset
     return spider_initium
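
util.add_hrefs is called throughout these examples but never defined in them; from its call sites (url, selectors, seeds, prefix) it plausibly behaves like the sketch below (implementation assumed; the optional seed_patterns filter seen in Example 11 is omitted):

import requests
from pyquery import PyQuery as pq

def add_hrefs(url, selectors, seeds, prefix=''):
    # fetch the page and add every href matched by the CSS selectors to the seed set,
    # prefixing relative links so they become absolute URLs
    d = pq(requests.get(url).text)
    for selector in selectors:
        for a in d(selector).items():
            href = a.attr('href')
            if href:
                seeds.add(href if href.startswith('http') else prefix + href)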
Example 4
 def get_auto_configured_spider(cls, offset=1):
     day_str = util.get_day_string('.', 'inverse', offset=offset)
     commercial_seed = {
         'http://www.881903.com/Page/ZH-TW/news.aspx?sdate=' + day_str +
         '&csid=261_341'
     }
     util.add_hrefs(url='http://www.881903.com/Page/ZH-TW/news.aspx?sdate=' +
                    day_str + '&csid=261_341',
                    seeds=commercial_seed,
                    selectors={'#newsCategoryTab a'},
                    prefix=cat_prefix)
     _seed = copy.deepcopy(commercial_seed)
     for seed in _seed:
         if cat_page_pattern.match(seed):
             r = util.get_safe_response(seed)
             if r:
                 d = pq(r.text)
                 pages = re.findall(total_page_pattern,
                                    d('.Font_Article_CH').text())
                 if pages:
                     total_page = int(pages[0])
                     for i in range(total_page):
                         commercial_seed.add(seed + '&page=' + str(i + 1))
     '''
     r = requests.get('http://www.881903.com/Page/ZH-TW/index.aspx')
     d = pq(r.text)
     for a in d('.header2012 ul li a').items():
         if a.attr('href'):
             u = a.attr('href')
             if not complete_pattern.match(u):
                 if incomplete_pattern.match(u):
                     u = prefix + u
                     commercial_seed.add(u)
     '''
     commercial_reg = {ur'http://www\.881903\.com/.+detail.*'}
     spider_commercial = SpiderCommercialRadio('SpiderCommercialRadio',
                                               commercial_seed,
                                               commercial_reg,
                                               THREAD_NUM=10)
     spider_commercial.BATCH_NUMBER = util.get_day_stamp() + 10260
     spider_commercial.OFFSET = offset
     # spider_commercial.MAX_DEPTH = 5
     return spider_commercial
Example 5
 def get_auto_configured_spider(cls, offset=0):
     current_day_string = util.get_day_string(offset=offset)
     day_string = current_day_string[0:4] + '-' + current_day_string[
         4:6] + '/' + current_day_string[6:8]
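     # e.g. '20170531' -> '2017-05/31', the daily path layout on macaodaily.com (inferred from the URLs below)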
     index_prefix = 'http://www.macaodaily.com/html/' + day_string + '/'
     macao_seed = {index_prefix + 'node_2.htm'}
     _index = requests.get(index_prefix + 'node_2.htm')
     d = pq(_index.text)
     for a in d('table.unnamed1 a').items():
         if a.attr('href') is not None:
             macao_seed.add(index_prefix + a.attr('href'))
     spider_macao = SpiderMacao('SpiderMacao',
                                macao_seed, {
                                    ur'http://www\.macaodaily\.com/html/' +
                                    day_string + ur'/content_.*'
                                },
                                THREAD_NUM=10)
     spider_macao.BATCH_NUMBER = util.get_day_stamp() + 10080
     spider_macao.OFFSET = offset
     return spider_macao
Example 6
 def get_auto_configured_spider(cls, offset=0):
     hkfp_seed = {
         'https://www.hongkongfp.com/',
         'https://www.hongkongfp.com/hong-kong-news/',
         'https://www.hongkongfp.com/china-news/',
         'https://www.hongkongfp.com/comment-analysis/',
         'https://www.hongkongfp.com/hkfp-voices/',
         'https://www.hongkongfp.com/category/hkfp-lens/'
     }
     hkfp_reg = {
         ur'https://www\.hongkongfp\.com/' +
         util.get_day_string(interval_str='/', offset=offset) + u'/.+'
     }
     spider_hkfp = SpiderHKFP('SpiderHKFP',
                              hkfp_seed,
                              hkfp_reg,
                              THREAD_NUM=10,
                              MAX_DEPTH=1)
     spider_hkfp.BATCH_NUMBER = util.get_day_stamp() + 10720
     spider_hkfp.OFFSET = offset
     return spider_hkfp
Example 7
    def normal_item_solver(self, item, task, response):

        with self.charset_lock:
            if self.charset == '':
                self.charset = self.find_charset(response)
        response.encoding = self.charset
        doc = self.get_doc(response)

        if doc:
            title = doc('title').text()
            if title == '':
                titles = re.findall(r'(?<=<title>).+(?=</title>)', response.text)
                if titles:
                    title = titles[0]
            t = util.get_day_string(offset=self.OFFSET)
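            # day-level stamp plus the current minute and second
            # (presumably so same-day items keep their fetch order)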
            t_stamp = (util.get_day_stamp(offset=self.OFFSET) +
                       time.localtime().tm_min * 60 + time.localtime().tm_sec)
            category = ''
            author = ''
            content = util.get_paragraphs_from_selector(doc, '#pressrelease p')
            if content == '':
                content = util.get_paragraphs_from_selector(
                    doc, '#pressrelease')
            if content == '':
                content = util.get_paragraphs_from_selector(doc, 'td p')
            # strip runs of asterisks at line ends; '*' must be escaped or re.sub raises
            content = re.sub(ur'\*+\n', '', content)

            item.raw = response.text
            item.title = title
            item.t = t
            item.t_stamp = t_stamp
            item.fetched_at = task.fetched_at
            item.category = category
            item.author = author
            item.content = content
            item.url = task.url
            item.source = 'GovInfoNews'
            item.task_no = self.BATCH_NUMBER
Example 8
    def normal_item_solver(self, item, task, response):

        response.encoding = 'utf-8'
        doc = self.get_doc(response)

        title = doc('h1').text()
        t = util.get_day_string(offset=self.OFFSET)
        t_stamp = util.get_now()
        category = ''
        author = ''
        content = util.get_paragraphs_from_selector(doc, '#content p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'Locpg'
        item.task_no = self.BATCH_NUMBER
Example 9
    def normal_item_solver(self, item, task, response):

        response.encoding = 'utf-8'
        doc = self.get_doc(response)

        title = doc('h1').text()
        t = util.get_day_string(offset=self.OFFSET)
        t_stamp = (util.get_timestamp_from_string(t, time_format='%Y%m%d') +
                   time.localtime().tm_hour * 3600 +
                   time.localtime().tm_min * 60)
        category = re.split(r'[:\s]', doc('.ban_t li').text())[1]
        author = ''
        content = util.get_paragraphs_from_selector(doc, '#ozoom p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'People'
        item.task_no = self.BATCH_NUMBER
Example 10
    def normal_item_solver(self, item, task, response):

        response.encoding = 'utf-8'
        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'}, u' \|.*')
        t = util.get_day_string(offset=self.OFFSET)
        t_stamp = util.get_timestamp_from_string(
            t) + time.localtime().tm_hour * 3600 + time.localtime().tm_min * 60
        category = doc('meta[name=subsection]').attr('content')
        author = ''
        content = util.get_paragraphs_from_selector(doc, '#masterContent p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'AppleNews'
        item.task_no = self.BATCH_NUMBER
        if util.within_active_interval(6, 20 * 60):
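            # fetch Facebook comment data only inside the active window
            # (the meaning of the two interval arguments is assumed)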
            _comments = util.get_filtered_facebook_comments_data(
                '367495573302576',
                doc('meta[property="og:url"]').attr('content'), task.url)
            if _comments:
                for _comment in _comments:
                    item.media_list.append(
                        self.NewsItem.MediaItem(
                            media_url=_comment['json_string'],
                            type='comments',
                            description='comments',
                            created_at=item.fetched_at))
Example 11
 def get_auto_configured_spider(cls, offset=0):
     hkej_seed = {
         'http://www2.hkej.com/instantnews',
         'http://www.hkej.com/template/landing11/jsp/main.jsp',
         'http://www1.hkej.com/dailynews/toc?date=' +
         util.get_day_string('-', offset=offset)
     }
     # util.add_hrefs('http://www.hkej.com/template/landing11/jsp/main.jsp', {'a'}, hkej_seed, seed_patterns={re.compile(ur'http://www.*hkej\.com/.+')})
     # ** currently the reg of the pages is only for 'instant news'
     hkej_reg = {
         ur'http://www.*?\.hkej\.com/instantnews.*article/.+',
         ur'http://www1\.hkej\.com/.*dailynews/.*article/.+'
     }
     spider_hkej = SpiderHKEJ('SpiderHKEJ', hkej_seed, hkej_reg,
                              THREAD_NUM=10, MAX_DEPTH=1)
     spider_hkej.BATCH_NUMBER = util.get_day_stamp() + 10150
     spider_hkej.OFFSET = offset
     return spider_hkej
Example 12
def main():
    seeds = {
        'http://www.vmo.org/tc/index/page_winterprice/',
        'http://www.afcd.gov.hk/tc_chi/agriculture/agr_fresh/agr_fresh.html#5',
        'http://www3.consumer.org.hk/pricewatch/supermarket/',
        'https://www.towngas.com/tc/Household/Customer-Services/Tariff',
        'https://www.clp.com.hk/zh/customer-service/tariff/residential-customers',
        'http://www.td.gov.hk/en/transport_in_hong_kong/public_transport/taxi/taxi_fare_of_hong_kong/'
    }
    for pid in products:
        if products[pid]['cat'] == 1:
            seeds.add(
                'http://www.fmo.org.hk/fish-price?path=12_43_55&id=3&start=' +
                util.get_day_string('-', offset=1) + '&end=' +
                util.get_day_string('-', offset=0) + '&items%5B%5D=' +
                str(pid))
    # add mtr tasks
    r = requests.get(
        'http://www.mtr.com.hk/share/customer/js/jplannerdata_chi.js')
    mtr_lines = {}
    lname = u''
    lstart = None
    lend = None
    vl_res = re.findall(r'myValue.+', r.text)
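    # scan the journey-planner JS row by row: each lineValue entry carries a station id and
    # a line name, from which each line's first and last station ids are recovered
    # (data layout inferred from the regexes below)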
    for vi in range(len(vl_res)):
        if re.match(r'.+lineValue\d+ = .+', vl_res[vi]):
            lns = re.findall(r'(?<=").+(?=")', vl_res[vi].split(';')[-2])
            if lns:
                ln = lns[0]
                sid = int(
                    re.findall(r'(?<=").+(?=")', vl_res[vi].split(';')[0])[0])
                if ln == 'tcline,drline':
                    ln = 'drline'
                if (lstart and (ln != lname or vi == len(vl_res) - 1)
                        and not re.match(r'line\d+', ln)):
                    if lname in mtr_line_names:
                        mtr_lines[str(lstart) + '_' + str(lend)] = [
                            mtr_line_names[lname], lstart,
                            int(lend)
                        ]
                    else:
                        print 'UNKNOWN LINE ' + lname
                if not lstart or ln != lname:
                    lname = ln
                    lstart = sid
                lend = sid
    for l in mtr_lines:
        seeds.add(
            'http://www.mtr.com.hk/share/customer/include/getdata.php?&type=data&sid='
            + str(mtr_lines[l][1]) + '&eid=' + str(mtr_lines[l][2]))
    spider_prices = SpiderPrices(
        seed_urls=seeds,
        regs=[
            'http://www.afcd.gov.hk/tc_chi/agriculture/agr_fresh/agr_fresh.html',
            'http://www.vmo.org/tc/index/page_winterprice/',
            r'http://www\.fmo\.org\.hk/fish\-price\?path=12_43_55&id=3.+',
            'http://www3.consumer.org.hk/pricewatch/supermarket/',
            'https://www.towngas.com/tc/Household/Customer-Services/Tariff',
            'https://www.clp.com.hk/zh/customer-service/tariff/residential-customers',
            'http://www.td.gov.hk/en/transport_in_hong_kong/public_transport/taxi/taxi_fare_of_hong_kong/',
            r'http://www\.mtr\.com\.hk/share/customer/include/getdata\.php\?&type=data&sid=\d+&eid=\d+'
        ],
        MAX_DEPTH=0,
        THREAD_NUM=1)
    spider_prices.mtr_lines = mtr_lines
    spider_prices.start()
    time.sleep(3)
Example 13
# -*- coding:utf-8 -*-
import common_news_spider
import util
import re
import threading

complete_pattern = re.compile(ur'http://.+')
news_prefix = 'http://www.zaobao.com'
news_id_pattern = re.compile(util.get_day_string() + ur'-\d+')
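# note: news_id_pattern bakes in the day string at import time, so a long-lived process
# would keep matching the date on which the module was loaded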


class SpiderZaobao(common_news_spider.CommonNewsSpider):

    ids = set()
    id_lock = threading.RLock()

    def get_url_of_link(self, link, doc, doc_url):

        u = link.attr('href')
        if u is not None:
            if not complete_pattern.match(u):
                u = news_prefix + u
        else:
            u = ''
        return u

    def normal_item_check(self, item, task, response):
        doc_url = task.url
        if item.id != '':
            with self.id_lock:
                # assumed continuation (the source snippet is truncated here):
                # ids already seen in this batch are rejected as duplicates
                if item.id in self.ids:
                    return False
                self.ids.add(item.id)
        return True
Example 14
class SpiderLocalpress(common_news_spider.CommonNewsSpider):
    def send_request(self, task):
        r = requests.get(task.url,
                         headers=headers,
                         timeout=self.RESPONSE_TIMEOUT_VALUE)
        task.fetched_at = util.get_now()
        return r

    def page_filter(self, doc, url):
        for reg_pattern in self.reg_patterns:
            if reg_pattern.match(url):
                t = doc('main time.updated').attr('datetime')
                t_stamp = util.get_timestamp_from_string(t)
                if t_stamp >= util.get_day_stamp(self.OFFSET):
                    return True
                return False
        return False

    def task_filter(self, doc, url, doc_url):
        for reg_pattern in self.reg_patterns:
            if reg_pattern.match(url):
                if not reg_pattern.match(doc_url):
                    return True
                t = doc('main time.updated').attr('datetime')
                t_stamp = util.get_timestamp_from_string(t)
                if t_stamp >= util.get_day_stamp(self.OFFSET):
                    return True
                return False
        return False
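
    # page_filter appears to decide whether a fetched article page is stored, while
    # task_filter decides whether a discovered link is enqueued; both drop pages whose
    # 'updated' time predates the batch day (roles inferred from their use here)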

    def normal_item_solver(self, item, task, response):

        doc = self.get_doc(response)

        title = doc('h1').text()
        t = doc('main time.updated').attr('datetime')
        t_stamp = util.get_timestamp_from_string(t)
        category = doc('.entry-category a').text()
        author = ''
        content = util.get_paragraphs_from_selector(doc,
                                                    'main .entry-content p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'LocalPress'
        item.task_no = self.BATCH_NUMBER

    @classmethod
    def get_auto_configured_spider(cls, offset=0):
        localpress_seed = {'http://www.localpresshk.com/'}
        try:
            r = requests.get('http://www.localpresshk.com/', headers=headers)
            d = pq(r.text)
            for a in d('.menu-newss-container a').items():
                if a.attr('href') and complete_pattern.match(a.attr('href')):
                    localpress_seed.add(a.attr('href'))
        except Exception, e:
            raise e
        day_str = util.get_day_string('/', offset=offset)
        day_str = day_str[:-3]
        spider_localpress = SpiderLocalpress(
            'SpiderLocalpress',
            localpress_seed,
            {ur'http://www\.localpresshk\.com/' + day_str + '.+'},
            THREAD_NUM=5)
        spider_localpress.OFFSET = offset
        spider_localpress.BATCH_NUMBER = util.get_day_stamp() + 10390
        return spider_localpress
Example 15
    def normal_item_solver(self, item, task, response):

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'h1:not(.articleDate)'})
        if title == '':
            title = doc('.bigtitlelink').text()
        if title == '':
            title = doc('font[size="5"]').text()
        t = util.get_day_string(offset=self.OFFSET)
        t_stamp = util.get_day_stamp(self.OFFSET)
        if t_stamp >= util.get_day_stamp(0):
            t_stamp = util.get_now()
        category = ''
        cat_words = cat_pattern.findall(task.url)
        if cat_words:
            category = doc('.' + cat_words[0]).text()
        if category == '':
            cats = re.findall(cat_pattern_2, task.url)
            if cats and cats[0] in cat_dict:
                category = cat_dict[cats[0]]
        author = ''
        content = util.get_paragraphs_from_selector(
            doc, '.leadin p') + util.get_paragraphs_from_selector(
                doc, '#contentCTN-right p,h3')
        if doc('.summaryPara'):
            content = util.get_paragraphs_from_selector(
                doc, '.summaryPara') + util.get_paragraphs_from_selector(
                    doc, '.newsText p')
        if content == '':
            content = doc('tr p').text()
        if content == '':
            if doc('tr'):
                for tr in doc('tr').items():
                    for thd in tr('th,td').items():
                        content += u'{:<20}'.format(thd.text())
                    content += u'\n'

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'OrientalDaily'
        item.task_no = self.BATCH_NUMBER
        for img in doc('.photo img').items():
            if img.attr('src') != '':
                media_u = prefix + img.attr('src')
                des = ''
                if img.attr('alt'):
                    des = img.attr('alt')
                media = self.NewsItem.MediaItem(media_url=media_u,
                                                type='image',
                                                description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        for a in doc('iframe').items():
            if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
                media_u = a.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                media = self.NewsItem.MediaItem(media_url=media_u,
                                                type='youtube',
                                                description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
Example 16
# -*- coding:utf-8 -*-
import common_news_spider
import util
import re
import requests
import threading

prefix = u'http://orientaldaily.on.cc'
complete_pattern = re.compile(ur'(http|https)://.+')
today_date_pattern = re.compile(util.get_day_string())
charset_pattern = re.compile(ur'(?<=charset=).+?(?=")')
cat_pattern = re.compile(ur'(?<=cnt/).+?(?=/)')
cat_pattern_2 = re.compile(ur'(?<=\d\d\d\d/).+?(?=/)')
title_pattern = re.compile(r'(?<=<!--title-->).*?(?=<!--/title-->)')
cat_dict = {
    'new': u'要聞港聞',
    'fin': u'財經',
    'spt': u'體育',
    'ent': u'娛樂',
    'fea': u'副刊',
    'com': u'投訴',
    'fnd': u'慈善基金',
    'new_f': u'評論',
    'hrs': u'馬經'
}


class SpiderOriental(common_news_spider.CommonNewsSpider):

    charset_found = ''
    charset_lock = threading.RLock()
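
Example 16 breaks off at the class attributes. charset_found and charset_lock mirror the lazy, thread-safe charset detection used in Example 7, which suggests a companion helper roughly like this (sketch, assumed):

def find_charset(self, response):
    # pull the declared charset out of the raw HTML via charset_pattern above;
    # fall back to utf-8 (fallback assumed)
    charsets = charset_pattern.findall(response.text)
    return charsets[0] if charsets else 'utf-8'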