def get_auto_configured_spider(cls, offset=0):
    xinhua_seed = {
        'http://www.news.cn/gangao/index.htm',
        'http://www.news.cn/gangao/jsxw.htm'
    }
    util.add_hrefs('http://www.news.cn/gangao/index.htm', {'.nav.domPC a'},
                   xinhua_seed, news_prefix)
    day_str = util.get_day_string('-', offset=offset)
    day_str = day_str[:-3] + '/' + day_str[-2:]
    spider_xinhua = SpiderXinhua(
        'SpiderXinhua', xinhua_seed,
        {ur'http://news\.xinhuanet\.com/gangao/' + day_str + '.+'},
        THREAD_NUM=5)
    spider_xinhua.OFFSET = offset
    spider_xinhua.BATCH_NUMBER = util.get_day_stamp() + 10430
    return spider_xinhua
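# Hypothetical driver sketch, not part of the original source: it assumes that
# get_auto_configured_spider() is exposed as a classmethod (as SpiderLocalpress
# does below) and that the returned spider has the start() method used by
# SpiderPrices' main() further down.
if __name__ == '__main__':
    spider = SpiderXinhua.get_auto_configured_spider(offset=0)
    spider.start()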
def get_auto_configured_spider(cls, offset=0):
    # pt_index = pq(requests.get('http://www.passiontimes.hk/4.0/index.php').text)
    # pt_seed = set([])
    # cat_pattern = re.compile(ur'/4.0/category/.*')
    # for cat in pt_index('div.footer-siteMap a').items():
    #     if cat_pattern.match(cat.attr('href').encode('utf-8')):
    #         pt_seed.add('http://www.passiontimes.hk' + str(cat.attr('href')))
    pt_seed = {
        'http://www.passiontimes.hk/4.0/index.php',
        'http://www.passiontimes.hk/4.0/category/3/19',
        'http://www.passiontimes.hk/4.0/category/1/5',
        'http://www.passiontimes.hk/4.0/category/1/4',
        'http://www.passiontimes.hk/4.0/category/1/7',
        'http://www.passiontimes.hk/4.0/category/2/62',
        'http://www.passiontimes.hk/4.0/category/1/1',
        'http://www.passiontimes.hk/4.0/category/1/3',
        'http://www.passiontimes.hk/4.0/category/1/2',
        'http://www.passiontimes.hk/4.0/category/1/6',
        'http://www.passiontimes.hk/4.0/category/2/18',
        'http://www.passiontimes.hk/4.0/category/2/14',
        'http://www.passiontimes.hk/4.0/category/2/15',
        'http://www.passiontimes.hk/4.0/category/2/16',
        'http://www.passiontimes.hk/4.0/category/2/17',
        'http://www.passiontimes.hk/4.0/category/2/10',
        'http://www.passiontimes.hk/4.0/category/2/11',
        'http://www.passiontimes.hk/4.0/category/2/12',
        'http://www.passiontimes.hk/4.0/category/2/13',
        'http://www.passiontimes.hk/4.0/category/2/8',
        'http://www.passiontimes.hk/4.0/category/2/9',
        'http://www.passiontimes.hk/4.0/category/3/125',
        'http://www.passiontimes.hk/4.0/category/1/37',
        'http://www.passiontimes.hk/4.0/category/2/135',
        'http://www.passiontimes.hk/4.0/category/1/124',
        'http://www.passiontimes.hk/4.0/category/3/22',
        'http://www.passiontimes.hk/4.0/category/3/23',
        'http://www.passiontimes.hk/4.0/category/3/20',
        'http://www.passiontimes.hk/4.0/category/3/21',
        'http://www.passiontimes.hk/4.0/category/3/24'
    }
    pt_spider = SpiderPt(
        'PtSpider', pt_seed,
        {r'http://www\.passiontimes\.hk/article/' +
         util.get_day_string('-', 'american', offset=offset) + '.*'},
        THREAD_NUM=10)
    pt_spider.OFFSET = offset
    pt_spider.BATCH_NUMBER = util.get_day_stamp() + 10001
    return pt_spider
def get_auto_configured_spider(cls, offset=0):
    initium_seed = {'https://theinitium.com/'}
    util.add_hrefs(url='https://theinitium.com/',
                   selectors={'div.left-nav-top li a'},
                   seeds=initium_seed,
                   prefix=prefix)
    initium_reg = {
        ur'https://theinitium\.com/article/' + util.get_day_string(offset=offset) + '-.+',
        ur'http://feeds\.initium.+'
    }
    spider_initium = SpiderInitium('SpiderInitium', initium_seed, initium_reg,
                                   THREAD_NUM=10)
    spider_initium.BATCH_NUMBER = util.get_day_stamp() + 10190
    spider_initium.OFFSET = offset
    return spider_initium
def get_auto_configured_spider(cls, offset=1):
    day_str = util.get_day_string('.', 'inverse', offset=offset)
    commercial_seed = {
        'http://www.881903.com/Page/ZH-TW/news.aspx?sdate=' + day_str + '&csid=261_341'
    }
    util.add_hrefs(url='http://www.881903.com/Page/ZH-TW/news.aspx?sdate=' + day_str + '&csid=261_341',
                   seeds=commercial_seed,
                   selectors={'#newsCategoryTab a'},
                   prefix=cat_prefix)
    # Expand each category page into its numbered sub-pages.
    _seed = copy.deepcopy(commercial_seed)
    for seed in _seed:
        if cat_page_pattern.match(seed):
            r = util.get_safe_response(seed)
            if r:
                d = pq(r.text)
                if re.findall(total_page_pattern, d('.Font_Article_CH').text()):
                    total_page = int(
                        re.findall(total_page_pattern,
                                   d('.Font_Article_CH').text())[0])
                    for i in range(total_page):
                        commercial_seed.add(seed + '&page=' + str(i + 1))
    # r = requests.get('http://www.881903.com/Page/ZH-TW/index.aspx')
    # d = pq(r.text)
    # for a in d('.header2012 ul li a').items():
    #     if a.attr('href'):
    #         u = a.attr('href')
    #         if not complete_pattern.match(u):
    #             if incomplete_pattern.match(u):
    #                 u = prefix + u
    #         commercial_seed.add(u)
    commercial_reg = {ur'http://www\.881903\.com/.+detail.*'}
    spider_commercial = SpiderCommercialRadio('SpiderCommercialRadio',
                                              commercial_seed,
                                              commercial_reg,
                                              THREAD_NUM=10)
    spider_commercial.BATCH_NUMBER = util.get_day_stamp() + 10260
    spider_commercial.OFFSET = offset
    # spider_commercial.MAX_DEPTH = 5
    return spider_commercial
def get_auto_configured_spider(cls, offset=0):
    current_day_string = util.get_day_string(offset=offset)
    # 'YYYYMMDD' -> 'YYYY-MM/DD', the path segment used by macaodaily.com.
    day_string = current_day_string[0:4] + '-' + \
        current_day_string[4:6] + '/' + current_day_string[6:8]
    index_prefix = 'http://www.macaodaily.com/html/' + day_string + '/'
    macao_seed = {index_prefix + 'node_2.htm'}
    _index = requests.get(index_prefix + 'node_2.htm')
    d = pq(_index.text)
    for a in d('table.unnamed1 a').items():
        if a.attr('href') is not None:
            macao_seed.add(index_prefix + a.attr('href'))
    spider_macao = SpiderMacao(
        'SpiderMacao', macao_seed,
        {ur'http://www\.macaodaily\.com/html/' + day_string + ur'/content_.*'},
        THREAD_NUM=10)
    spider_macao.BATCH_NUMBER = util.get_day_stamp() + 10080
    spider_macao.OFFSET = offset
    return spider_macao
def get_auto_configured_spider(cls, offset=0):
    hkfp_seed = {
        'https://www.hongkongfp.com/',
        'https://www.hongkongfp.com/hong-kong-news/',
        'https://www.hongkongfp.com/china-news/',
        'https://www.hongkongfp.com/comment-analysis/',
        'https://www.hongkongfp.com/hkfp-voices/',
        'https://www.hongkongfp.com/category/hkfp-lens/'
    }
    hkfp_reg = {
        ur'https://www.hongkongfp.com/' +
        util.get_day_string(interval_str='/', offset=offset) + u'/.+'
    }
    spider_hkfp = SpiderHKFP('SpiderHKFP', hkfp_seed, hkfp_reg,
                             THREAD_NUM=10, MAX_DEPTH=1)
    spider_hkfp.BATCH_NUMBER = util.get_day_stamp() + 10720
    spider_hkfp.OFFSET = offset
    return spider_hkfp
def normal_item_solver(self, item, task, response):
    # Detect the page charset once and cache it; the lock guards against
    # concurrent worker threads.
    with self.charset_lock:
        if self.charset == '':
            self.charset = self.find_charset(response)
    response.encoding = self.charset
    doc = self.get_doc(response)
    if doc:
        title = doc('title').text()
        if title == '':
            if re.findall(r'(?<=<title>).+(?=</title>)', response.text):
                title = re.findall(r'(?<=<title>).+(?=</title>)',
                                   response.text)[0]
        t = util.get_day_string(offset=self.OFFSET)
        t_stamp = util.get_day_stamp(offset=self.OFFSET) + \
            time.localtime().tm_min * 60 + time.localtime().tm_sec
        category = ''
        author = ''
        content = util.get_paragraphs_from_selector(doc, '#pressrelease p')
        if content == '':
            content = util.get_paragraphs_from_selector(doc, '#pressrelease')
        if content == '':
            content = util.get_paragraphs_from_selector(doc, 'td p')
        # Drop separator lines made of asterisks.
        content = re.sub(ur'\*+\n', '', content)
        item.raw = response.text
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'GovInfoNews'
        item.task_no = self.BATCH_NUMBER
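# Hypothetical sketch of the charset lookup used above; find_charset is not
# shown in this section, so this is an assumed implementation. It pulls the
# charset declared in the page's Content-Type meta tag (the regex mirrors the
# charset_pattern defined for SpiderOriental later on) and falls back to utf-8.
def find_charset(self, response):
    charsets = re.findall(ur'(?<=charset=).+?(?=")', response.text)
    if charsets:
        return charsets[0].strip()
    return 'utf-8'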
def normal_item_solver(self, item, task, response):
    response.encoding = 'utf-8'
    doc = self.get_doc(response)
    title = doc('h1').text()
    t = util.get_day_string(offset=self.OFFSET)
    t_stamp = util.get_now()
    category = ''
    author = ''
    content = util.get_paragraphs_from_selector(doc, '#content p')
    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'Locpg'
    item.task_no = self.BATCH_NUMBER
def normal_item_solver(self, item, task, response):
    response.encoding = 'utf-8'
    doc = self.get_doc(response)
    title = doc('h1').text()
    t = util.get_day_string(offset=self.OFFSET)
    t_stamp = util.get_timestamp_from_string(t, time_format='%Y%m%d') + \
        time.localtime().tm_hour * 3600 + time.localtime().tm_min * 60
    category = re.split(r'[:\s]', doc('.ban_t li').text())[1]
    author = ''
    content = util.get_paragraphs_from_selector(doc, '#ozoom p')
    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'People'
    item.task_no = self.BATCH_NUMBER
def normal_item_solver(self, item, task, response):
    response.encoding = 'utf-8'
    doc = self.get_doc(response)
    title = util.get_filtered_title(doc, {'title'}, u' \|.*')
    t = util.get_day_string(offset=self.OFFSET)
    t_stamp = util.get_timestamp_from_string(t) + \
        time.localtime().tm_hour * 3600 + time.localtime().tm_min * 60
    category = doc('meta[name=subsection]').attr('content')
    author = ''
    content = util.get_paragraphs_from_selector(doc, '#masterContent p')
    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'AppleNews'
    item.task_no = self.BATCH_NUMBER
    # During the active window, also capture Facebook comments for the article
    # and attach their raw JSON as media items.
    if util.within_active_interval(6, 20 * 60):
        _comments = util.get_filtered_facebook_comments_data(
            '367495573302576',
            doc('meta[property="og:url"]').attr('content'), task.url)
        if _comments:
            for _comment in _comments:
                item.media_list.append(
                    self.NewsItem.MediaItem(
                        media_url=_comment['json_string'],
                        type='comments',
                        description='comments',
                        created_at=item.fetched_at))
def get_auto_configured_spider(cls, offset=0):
    hkej_seed = {
        'http://www2.hkej.com/instantnews',
        'http://www.hkej.com/template/landing11/jsp/main.jsp',
        'http://www1.hkej.com/dailynews/toc?date=' + util.get_day_string('-', offset=offset)
    }
    # util.add_hrefs('http://www.hkej.com/template/landing11/jsp/main.jsp', {'a'}, hkej_seed,
    #                seed_patterns={re.compile(ur'http://www.*hkej\.com/.+')})
    # ** currently the reg of the pages is only for 'instant news'
    hkej_reg = {
        ur'http://www.*?\.hkej\.com/instantnews.*article/.+',
        ur'http://www1\.hkej\.com/.*dailynews/.*article/.+'
    }
    spider_hkej = SpiderHKEJ('SpiderHKEJ', hkej_seed, hkej_reg,
                             THREAD_NUM=10, MAX_DEPTH=1)
    spider_hkej.BATCH_NUMBER = util.get_day_stamp() + 10150
    spider_hkej.OFFSET = offset
    return spider_hkej
def main():
    seeds = {
        'http://www.vmo.org/tc/index/page_winterprice/',
        'http://www.afcd.gov.hk/tc_chi/agriculture/agr_fresh/agr_fresh.html#5',
        'http://www3.consumer.org.hk/pricewatch/supermarket/',
        'https://www.towngas.com/tc/Household/Customer-Services/Tariff',
        'https://www.clp.com.hk/zh/customer-service/tariff/residential-customers',
        'http://www.td.gov.hk/en/transport_in_hong_kong/public_transport/taxi/taxi_fare_of_hong_kong/'
    }
    util.get_day_string('-', offset=0)
    # One fish-price query per category-1 product.
    for pid in products:
        if products[pid]['cat'] == 1:
            seeds.add(
                'http://www.fmo.org.hk/fish-price?path=12_43_55&id=3&start=' +
                util.get_day_string('-', offset=1) + '&end=' +
                util.get_day_string('-', offset=0) + '&items%5B%5D=' + str(pid))
    # Add MTR fare tasks: scan the journey-planner script for each line's first
    # and last station ids, then request fare data for that span.
    r = requests.get(
        'http://www.mtr.com.hk/share/customer/js/jplannerdata_chi.js')
    mtr_lines = {}
    lname = u''
    lstart = None
    lend = None
    vl_res = re.findall(r'myValue.+', r.text)
    for vi in range(len(vl_res)):
        if re.match(r'.+lineValue\d+ = .+', vl_res[vi]):
            lns = re.findall(r'(?<=").+(?=")', vl_res[vi].split(';')[-2])
            if lns:
                ln = lns[0]
                sid = int(
                    re.findall(r'(?<=").+(?=")', vl_res[vi].split(';')[0])[0])
                if ln == 'tcline,drline':
                    ln = 'drline'
                if lstart and (ln != lname or vi == len(vl_res) - 1) \
                        and not re.match(r'line\d+', ln):
                    if lname in mtr_line_names:
                        mtr_lines[str(lstart) + '_' + str(lend)] = [
                            mtr_line_names[lname], lstart, int(lend)
                        ]
                    else:
                        print 'UNKNOWN LINE ' + lname
                if not lstart or ln != lname:
                    lname = ln
                    lstart = sid
                lend = sid
    for l in mtr_lines:
        seeds.add(
            'http://www.mtr.com.hk/share/customer/include/getdata.php?&type=data&sid=' +
            str(mtr_lines[l][1]) + '&eid=' + str(mtr_lines[l][2]))
    spider_prices = SpiderPrices(
        seed_urls=seeds,
        regs=[
            'http://www.afcd.gov.hk/tc_chi/agriculture/agr_fresh/agr_fresh.html',
            'http://www.vmo.org/tc/index/page_winterprice/',
            r'http://www\.fmo\.org\.hk/fish\-price\?path=12_43_55&id=3.+',
            'http://www3.consumer.org.hk/pricewatch/supermarket/',
            'https://www.towngas.com/tc/Household/Customer-Services/Tariff',
            'https://www.clp.com.hk/zh/customer-service/tariff/residential-customers',
            'http://www.td.gov.hk/en/transport_in_hong_kong/public_transport/taxi/taxi_fare_of_hong_kong/',
            r'http://www\.mtr\.com\.hk/share/customer/include/getdata\.php\?&type=data&sid=\d+&eid=\d+'
        ],
        MAX_DEPTH=0,
        THREAD_NUM=1)
    spider_prices.mtr_lines = mtr_lines
    spider_prices.start()
    time.sleep(3)
# -*- coding:utf-8 -*-
import common_news_spider
import util
import re
import threading

complete_pattern = re.compile(ur'http://.+')
news_prefix = 'http://www.zaobao.com'
news_id_pattern = re.compile(util.get_day_string() + ur'-\d+')


class SpiderZaobao(common_news_spider.CommonNewsSpider):
    # Article ids seen so far, shared across worker threads.
    ids = set()
    id_lock = threading.RLock()

    def get_url_of_link(self, link, doc, doc_url):
        # Normalise relative hrefs against the site prefix.
        u = link.attr('href')
        if u is not None:
            if not complete_pattern.match(u):
                u = news_prefix + u
        else:
            u = ''
        return u

    def normal_item_check(self, item, task, response):
        doc_url = task.url
        if item.id != '':
            with self.id_lock:
                if item.id in self.ids:
                    # The original snippet is truncated here; the assumed
                    # behaviour is to reject already-seen ids and record new ones.
                    return False
                self.ids.add(item.id)
        return True
class SpiderLocalpress(common_news_spider.CommonNewsSpider):
    def send_request(self, task):
        r = requests.get(task.url, headers=headers,
                         timeout=self.RESPONSE_TIMEOUT_VALUE)
        task.fetched_at = util.get_now()
        return r

    def page_filter(self, doc, url):
        # Keep only pages whose last-updated time falls on or after the target day.
        for reg_pattern in self.reg_patterns:
            if reg_pattern.match(url):
                t = doc('main time.updated').attr('datetime')
                t_stamp = util.get_timestamp_from_string(t)
                if t_stamp >= util.get_day_stamp(self.OFFSET):
                    return True
                return False
        return False

    def task_filter(self, doc, url, doc_url):
        for reg_pattern in self.reg_patterns:
            if reg_pattern.match(url):
                if not reg_pattern.match(doc_url):
                    return True
                t = doc('main time.updated').attr('datetime')
                t_stamp = util.get_timestamp_from_string(t)
                if t_stamp >= util.get_day_stamp(self.OFFSET):
                    return True
                return False
        return False

    def normal_item_solver(self, item, task, response):
        doc = self.get_doc(response)
        title = doc('h1').text()
        t = doc('main time.updated').attr('datetime')
        t_stamp = util.get_timestamp_from_string(t)
        category = doc('.entry-category a').text()
        author = ''
        content = util.get_paragraphs_from_selector(doc, 'main .entry-content p')
        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'LocalPress'
        item.task_no = self.BATCH_NUMBER

    @classmethod
    def get_auto_configured_spider(cls, offset=0):
        localpress_seed = {'http://www.localpresshk.com/'}
        try:
            r = requests.get('http://www.localpresshk.com/', headers=headers)
            d = pq(r.text)
            for a in d('.menu-newss-container a').items():
                if a.attr('href') and complete_pattern.match(a.attr('href')):
                    localpress_seed.add(a.attr('href'))
        except Exception, e:
            raise e
        day_str = util.get_day_string('/', offset=offset)
        day_str = day_str[:-3]
        spider_localpress = SpiderLocalpress(
            'SpiderLocalpress', localpress_seed,
            {ur'http://www.localpresshk.com/' + day_str + '.+'},
            THREAD_NUM=5)
        spider_localpress.OFFSET = offset
        spider_localpress.BATCH_NUMBER = util.get_day_stamp() + 10390
        return spider_localpress
def normal_item_solver(self, item, task, response):
    doc = self.get_doc(response)
    title = util.get_filtered_title(doc, {'h1:not(.articleDate)'})
    if title == '':
        title = doc('.bigtitlelink').text()
    if title == '':
        title = doc('font[size="5"]').text()
    t = util.get_day_string(offset=self.OFFSET)
    t_stamp = util.get_day_stamp(self.OFFSET)
    if t_stamp >= util.get_day_stamp(0):
        t_stamp = util.get_now()
    category = ''
    if cat_pattern.findall(task.url):
        cat_word = cat_pattern.findall(task.url)[0]
        category = doc('.' + cat_word).text()
    if category == '':
        if re.findall(cat_pattern_2, task.url):
            cat = re.findall(cat_pattern_2, task.url)[0]
            if cat in cat_dict:
                category = cat_dict[cat]
    author = ''
    content = util.get_paragraphs_from_selector(doc, '.leadin p') + \
        util.get_paragraphs_from_selector(doc, '#contentCTN-right p,h3')
    if doc('.summaryPara'):
        content = util.get_paragraphs_from_selector(doc, '.summaryPara') + \
            util.get_paragraphs_from_selector(doc, '.newsText p')
    if content == '':
        content = doc('tr p').text()
    if content == '':
        # Fall back to flattening any table layout into fixed-width text:
        # each th/td cell is left-aligned in a 20-character column and each
        # row ends with a newline, e.g. a row with cells u'10' and u'20'
        # becomes u'{:<20}{:<20}\n'.format(u'10', u'20').
        if doc('tr'):
            for tr in doc('tr').items():
                for thd in tr('th,td').items():
                    content += u'{:<20}'.format(thd.text())
                content += u'\n'
    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'OrientalDaily'
    item.task_no = self.BATCH_NUMBER
    # Collect article images and embedded YouTube players as media items.
    for img in doc('.photo img').items():
        if img.attr('src') != '':
            media_u = prefix + img.attr('src')
            des = ''
            if img.attr('alt'):
                des = img.attr('alt')
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='image',
                                            description=des,
                                            created_at=item.fetched_at)
            item.media_list.append(media)
    for a in doc('iframe').items():
        if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
            media_u = a.attr('src')
            if re.match(r'//.+', media_u):
                media_u = 'http:' + media_u
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='youtube',
                                            description='youtube',
                                            created_at=item.fetched_at)
            item.media_list.append(media)
# -*- coding:utf-8 -*-
import common_news_spider
import util
import re
import requests
import threading

prefix = u'http://orientaldaily.on.cc'
complete_pattern = re.compile(ur'(http|https)://.+')
today_date_pattern = re.compile(util.get_day_string())
charset_pattern = re.compile(ur'(?<=charset=).+?(?=")')
cat_pattern = re.compile(ur'(?<=cnt/).+?(?=/)')
cat_pattern_2 = re.compile(ur'(?<=\d\d\d\d/).+?(?=/)')
title_pattern = re.compile(r'(?<=<!--title-->).*?(?=<!--/title-->)')
cat_dict = {
    'new': u'要聞港聞',
    'fin': u'財經',
    'spt': u'體育',
    'ent': u'娛樂',
    'fea': u'副刊',
    'com': u'投訴',
    'fnd': u'慈善基金',
    'new_f': u'評論',
    'hrs': u'馬經'
}


class SpiderOriental(common_news_spider.CommonNewsSpider):
    charset_found = ''
    charset_lock = threading.RLock()