# Standard-library imports used by the crawlers in this section; helpers
# such as extract(), cleantext(), clean_paragraphs(), and urlencode() are
# defined elsewhere in the module.
import json
import re
import ssl
from datetime import datetime

try:
    from urllib.parse import urljoin, urlparse  # Python 3
except ImportError:
    from urlparse import urljoin, urlparse  # Python 2


def _crawl_mwnation_com(crawler, out):
    urls = set()
    index = crawler.fetch_content('http://mwnation.com/section/chichewa/')
    pages = re.findall(r'/section/chichewa/page/(\d+)/', index)
    num_pages = max([int(p) for p in pages])
    for page in range(1, num_pages + 1):
        url = 'http://mwnation.com/section/chichewa/'
        if page > 1:
            url += 'page/%d/' % page
        doc = crawler.fetch_content(url)
        urls.update(re.findall(r'<a href="([^"]+?)">Continue Reading', doc))
    for url in sorted(urls):
        doc = crawler.fetch_content(url)
        pubdate = re.search(
            r'<meta property="article:published_time" content="([^"]+)"',
            doc)
        pubdate = pubdate.group(1) if pubdate is not None else None
        title = extract('<h1 class="entry-title" itemprop="headline">',
                        '</h1>', doc) or ''
        body = extract('<div class="entry-content" itemprop="articleBody">',
                       '<footer ', doc) or ''
        paras = clean_paragraphs(title + '<br/>' + body)
        text = '\n'.join(paras) + '\n'
        if text.find(' the ') >= 0:  # likely English
            continue
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write(text)

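# extract() is defined elsewhere in the crawler; the sketch below is a
# hypothetical reconstruction inferred from the call sites in this file:
# it returns the text between the first occurrence of two literal markers,
# or None when either marker is missing (hence the frequent `or ''` guards).
def extract(start, end, html):
    startpos = html.find(start)
    if startpos < 0:
        return None
    startpos += len(start)
    endpos = html.find(end, startpos)
    return html[startpos:endpos] if endpos >= 0 else None
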
def _crawl_kauno_diena_lt(crawler, out):
    urls = {}
    for i in range(1, 6):
        url = 'http://kauno.diena.lt/sitemap/kd/sitemap%d.xml' % i
        urls.update(crawler.fetch_sitemap(url))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        try:
            html = doc.content.decode('utf-8')
        except UnicodeDecodeError:
            continue
        title = extract('<h1 class="title" id="page-title">', '</h1>', html)
        title = cleantext(title if title else '')
        body = extract("<span itemprop='articleBody'>", '</div>', html) or ''
        paras = []
        for p in clean_paragraphs('%s<br/>%s' % (title, body)):
            if 'MicrosoftInternetExplorer4' in p:
                break
            paras.append(p)
        pubdate = re.search(
            r'<span\s+property="dc:date\s+dc:created"\s+content="(20[^"]+)"',
            html)
        pubdate = pubdate.group(1) if pubdate else None
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

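# cleantext() and clean_paragraphs() are likewise defined elsewhere; these
# are rough sketches under the assumption that cleantext() strips markup and
# collapses whitespace, while clean_paragraphs() splits HTML on block-level
# tags and returns the non-empty cleaned paragraphs.
def cleantext(html):
    text = re.sub(r'<[^>]+>', ' ', html)  # drop tags; entities not handled
    return ' '.join(text.split())


def clean_paragraphs(html):
    chunks = re.split(r'</?(?:p|br|div|h[1-6])[^>]*>', html)
    return [p for p in (cleantext(c) for c in chunks) if p]
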
def _crawl_observador_pt(crawler, out):
    urls = set()
    for author_page in sorted(
            re.findall(
                r'href="(https?://observador.pt/perfil/[a-zA-Z_\-0-9]+/)"',
                crawler.fetch_content('http://observador.pt/autores/'))):
        html = crawler.fetch_content(author_page)
        urls.update(
            re.findall(
                r'href="(https?://observador.pt/20\d{2}/\d{2}/\d{2}/[^"]+)"',
                html))
    for url in sorted(urls):
        try:
            html = crawler.fetch_content(url)
        except UnicodeDecodeError:
            continue
        title = re.search(r'<meta property="og:title" content="([^"]+)"',
                          html)
        title = title.group(1) if title else ''
        pubdate = re.search(r'"dateModified":"([^"]+)"', html)
        pubdate = pubdate.group(1) if pubdate else None
        lead = extract('<div class="lead">', '</div>', html) or ''
        content = extract('<div class="content">', '<h1>', html) or ''
        text = '\n'.join(clean_paragraphs('<p>'.join([title, lead, content])))
        text = text.split('\nContinuar a ler')[0]
        text = text.split('\nLer mais')[0]
        text = text.split('\nPartilhe')[0]
        text = text.split('\nComente')[0]
        if text:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write(text)
            out.write('\n')

def crawl_blogspot(crawler, out, host):
    sitemap = crawler.fetch_sitemap('https://%s/sitemap.xml' % host)
    pubdate_regex = re.compile(
        r"<abbr class='published' title='([^']*)'>[^<]*</abbr>")
    for url in sorted(sitemap.keys()):
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        if pubdate is None:
            pubdate = fetchresult.headers.get('Last-Modified')
        if pubdate is None:
            pubdate = sitemap[url]
        title = re.search(r"<meta content='([^']+)' property='og:title'/>",
                          html)
        title = title.group(1) if title else ''
        post = extract("<div class='post-body entry-content'>",
                       "<div class='post-footer'>", html) or ''
        paras = clean_paragraphs(title + '<br/>' + post)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: Blog\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

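# The `crawler` object these functions receive is constructed elsewhere.
# Below is a hypothetical minimal interface inferred from the call sites in
# this file; the names and return shapes are assumptions, not the real class.
class Crawler(object):
    def fetch(self, url):
        # Returns a result with .status (int), .content (bytes), and
        # .headers (a dict-like supporting .get()).
        raise NotImplementedError

    def fetch_content(self, url):
        # Returns the decoded text of a successfully fetched page.
        raise NotImplementedError

    def fetch_sitemap(self, url, subsitemap_filter=None):
        # Returns a dict mapping page URL to its sitemap lastmod date,
        # optionally filtering which sub-sitemaps of an index are followed.
        raise NotImplementedError

    def set_context(self, ssl_context):
        # Replaces the SSL context used for subsequent fetches
        # (used by crawl_peig_ie below).
        raise NotImplementedError
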
def crawl_tuairisc_ie(crawler, out):
    sitemap = crawler.fetch_sitemap('https://tuairisc.ie/sitemap.xml')
    pubdate_regex = re.compile(
        r'<time datetime="(20\d\d-\d\d-\d\d)\s+(\d\d:\d\d)" '
        r'itemprop="datePublished">')
    for url in sorted(sitemap.keys()):
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = extract('<h1 class="title article--full__title">', '</h1>',
                        html) or ''
        pubdate_match = pubdate_regex.search(html)
        if pubdate_match:
            pubdate = '%sT%s:00Z' % (pubdate_match.group(1),
                                     pubdate_match.group(2))
        else:
            pubdate = sitemap[url]
        body = extract(
            '<div class="article--full__content" itemprop="articleBody">',
            '</article>', html)
        if not body:
            continue
        paras = clean_paragraphs(title + '<p/>' + body)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

def crawl_ainm_ie(crawler, out):
    links = set()
    for let in map(chr, range(65, 91)):  # 'A' through 'Z'
        idxres = crawler.fetch('https://www.ainm.ie/Abc.aspx?Letter=%s' % let)
        if idxres.status != 200:
            continue
        idxhtml = idxres.content.decode('utf-8')
        index = extract('<div id="pageContent" role="main">',
                        '<!-- .contentWrapper-->', idxhtml) or ''
        for link in re.findall(r'<a href="(Bio.aspx\?ID=[^"]+?)">', index):
            links.add('https://www.ainm.ie/%s' % link)
    for url in sorted(links):
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = re.search(r'<title>(.+?)</title>', html)
        title = title.group(1).split('|')[0] if title else ''
        body = extract('<div class="article">', '<!-- .contentWrapper-->',
                       html) or ''
        body = body.split('<div id="machines"')[0]
        paras = clean_paragraphs(title + '<br/>' + body)
        pubdate = fetchresult.headers.get('Last-Modified')
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: Biography\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

def _crawl_iltirreno_gelocal_it(crawler, out):
    urls = set()
    for category in ('italia-mondo', 'focus/toscana-economia',
                     'empoli/cronaca', 'grosseto/cronaca', 'livorno/cronaca',
                     'livorno/dagli-enti', 'lucca/cronaca', 'pisa/cronaca',
                     'prato/cronaca', 'versilia/cronaca'):
        urls.update(_find_tirreno_urls(crawler, category))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        content = doc.content.decode('utf-8')
        header = extract('<h1 itemprop="headline name">',
                         '<span itemprop="author"', content) or ''
        body = extract('<span itemprop="articleBody" >', '©', content) or ''
        paras = clean_paragraphs('%s<p/>%s' % (header, body))
        text = '\n'.join(paras)
        for sep in ('Tags\n', 'Redazione | Scriveteci', 'TrovaRistorante',
                    '<a href="', 'I COMMENTI DEI LETTORI', '©RIPRODUZIONE'):
            text = text.split(sep)[0]
        paras = text.splitlines()
        pubdate = re.search(
            r'<time itemprop="datePublished" content="([^"]+)"', content)
        pubdate = pubdate.group(1) if pubdate else None
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

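# _find_tirreno_urls() is not part of this excerpt. A hypothetical sketch,
# assuming each category page links its articles with absolute URLs under
# the same category path (the exact URL pattern is an assumption):
def _find_tirreno_urls(crawler, category):
    index = crawler.fetch_content('http://iltirreno.gelocal.it/%s' % category)
    return set(re.findall(
        r'href="(https?://iltirreno\.gelocal\.it/%s/[^"]+)"' % category,
        index))
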
def crawl_meoneile_ie(crawler, out):
    sitemap = crawler.fetch_sitemap('https://meoneile.ie/sitemap.xml')
    for url in sorted(sitemap.keys()):
        if url == 'https://meoneile.ie/':
            continue
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = extract('<title>', '</title>', html) or ''
        title = title.split('<')[0].strip()
        video = re.search(
            r"<iframe.*src='(//player.vimeo.com/video/[0-9]+)[^>]*></iframe>",
            html)
        body = extract("<div class='article-content'>", '</article>',
                       html) or ''
        byline = extract("<div class='byline'>", '</span>', html) or ''
        byline = _byline_to_pubdate(byline)
        if body.find('<strong>%s</strong>' % title) >= 0:
            title = ''  # the body already repeats the title
        paras = clean_paragraphs(title + '<br/>' + body)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if video:
                out.write('# Video: https:%s\n' % video.group(1))
            if byline:
                out.write('# Publication-Date: %s\n' % byline)
            for para in paras:
                if para == 'Roinn':  # share-widget label
                    continue
                out.write(para + '\n')

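# _byline_to_pubdate() is defined elsewhere. A hypothetical sketch, assuming
# the meoneile.ie byline contains a dd/mm/yyyy date that should be emitted
# as an ISO YYYY-MM-DD string:
def _byline_to_pubdate(byline):
    match = re.search(r'(\d{1,2})/(\d{1,2})/(20\d{2})', byline)
    if not match:
        return None
    return '%s-%02d-%02d' % (match.group(3), int(match.group(2)),
                             int(match.group(1)))
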
def _crawl_telegraaf_nl(crawler, out):
    sitemap = crawler.fetch_sitemap(
        'http://www.telegraaf.nl/sitemap.xml',
        subsitemap_filter=_should_fetch_telegraaf_sitemap)
    for url in sorted(sitemap):
        doc = crawler.fetch(urlencode(url))
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = re.search(
            r'<meta [a-zA-Z\-="]* property="og:title" content="(.+?)"', html)
        title = title.group(1) if title else ''
        pubdate = re.search(r'"publishDate":"([^"]+)"', html)
        pubdate = pubdate.group(1) if pubdate else None
        text = extract(
            'data-element="ArticlePage-intro">',
            '<div class="flex" data-element="ArticlePage-socialShare-root">',
            html) or ''
        paras = clean_paragraphs(title + '<br/>' + text)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

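# _should_fetch_telegraaf_sitemap() is not shown in this excerpt. A
# hypothetical placeholder, assuming the sitemap index also lists video and
# other non-article sub-sitemaps that are not worth fetching:
def _should_fetch_telegraaf_sitemap(url):
    return 'video' not in url
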
def _crawl_vatantat_ru(crawler, out):
    index = crawler.fetch_content('http://www.vatantat.ru/')
    last = max([int(p) for p in re.findall(r'index\.php\?pg=(\d+?)"', index)])
    for page in range(2, last + 1):
        url = 'http://www.vatantat.ru/index.php?pg=%d' % page
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        content = doc.content.decode('utf-8')
        html = extract('<p><span style="font-size: large;"><strong>',
                       '<span style="font-size: 80%; font-weight: bold;">',
                       content)
        if not html:
            continue
        html = html.split('(“Ватаным Татарстан”,')[0]
        html = html.split('<script>')[0]
        paras = clean_paragraphs(html)
        if not paras:
            continue
        pubdate = re.search(
            r'Татарстан”, /№ (none|\d+), '
            r'(\d\d)\.(\d\d)\.(20\d\d)/', content)
        if pubdate is not None:
            pubdate = ('%s-%s-%s' %
                       (pubdate.group(4), pubdate.group(3), pubdate.group(2)))
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

def _crawl_kyym_ru(crawler, out):
    index = crawler.fetch_content('http://www.kyym.ru/')
    last = max([
        int(s) for s in re.findall(r'href="/index\.php\?start=(\d+?)"', index)
    ])
    urls = set()
    for page in range(1, last + 1):
        doc = crawler.fetch_content('http://www.kyym.ru/index.php?start=%d' %
                                    page)
        for path in re.findall(
                r'<a href="(/index\.php\?view=article&amp;[^"]+?)"', doc):
            urls.add('http://www.kyym.ru' + path.replace('&amp;', '&'))
    for url in sorted(urls):
        doc = crawler.fetch_content(url)
        html = extract('<div class="news_item_article">',
                       '<!--end news item -->', doc)
        if not html:
            continue
        paras = clean_paragraphs(html)
        if not paras:
            continue
        pubdate = re.search(
            r'<span class="createdate"><!-- date and by -->'
            r'\s*(\d{1,2}).(\d{2}).(20\d{2})', doc, flags=re.DOTALL)
        if pubdate is not None:
            pubdate = '%s-%s-%s' % (pubdate.group(3), pubdate.group(2),
                                    pubdate.group(1))
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

def crawl_peig_ie(crawler, out):
    crawler.set_context(ssl.SSLContext(ssl.PROTOCOL_TLSv1_2))
    sitemap = crawler.fetch_sitemap('https://peig.ie/sitemap_index.xml',
                                    subsitemap_filter=_peig_filter_robots)

    def peig_cat(page):
        if page.find('/imeachtai/') >= 0:
            return 'Events'
        elif page.find('peig.ie/20') >= 0:
            return 'News'
        elif page.find('/fol%C3%BAntais/') >= 0:
            return 'Job listings'
        else:
            return ''

    # Peig.ie has a lot of posts from other sites.
    def skip_page(site):
        if site.find('//nos.ie/') >= 0:
            return True
        elif site.find('//tuairisc.ie/') >= 0:
            return True
        elif site.find('//meoneile.ie/') >= 0:
            return True
        else:
            return False

    for url in sorted(sitemap.keys()):
        if url == 'https://peig.ie/':
            continue
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = re.search(r'<title>(.+?)</title>', html)
        title = title.group(1).split('|')[0].strip() if title else ''
        read_more = re.search(r'<a.*href="([^"]+)"[^>]*>Níos mó</a>', html)
        if read_more and skip_page(read_more.group(1)):
            continue
        if '<meta property="article:modified_time"' in html:
            date = re.search(
                r'<meta property="article:modified_time" content="([^"]+)"',
                html).group(1)
        else:
            date = re.search(r'"datePublished":"([^"]+)"', html)
            date = date.group(1) if date else None
        body = extract('<div class="uk-margin-medium-top" property="text">',
                       '<ul class="uk-pagination', html) or ''
        paras = clean_paragraphs(title + '<br/>' + body)
        genre = peig_cat(url)
        if paras:
            out.write('# Location: %s\n' % url)
            if genre:
                out.write('# Genre: %s\n' % genre)
            if date:
                out.write('# Publication-Date: %s\n' % date)
            out.write('\n'.join(paras) + '\n')
    crawler.set_context(ssl.SSLContext(ssl.PROTOCOL_TLSv1))

def _crawl_asymptotejournal_com(crawler, out):
    url = ('https://www.asymptotejournal.com/nonfiction/'
           'shibu-tudu-memories-of-the-kirta-dangra/santhali/')
    html = crawler.fetch_content(url)
    content = extract('<!-- article content -->',
                      '<img src="/images/end-logo-black.gif"', html) or ''
    out.write('# Location: %s\n' % url)
    out.write('# Genre: Fiction\n')
    paras = clean_paragraphs(content)
    # Drop paragraphs starting with a Latin capital (likely English text).
    paras = [p for p in paras if p[0] not in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ']
    out.write('\n'.join(paras) + '\n')

def crawl_pl_usembassy_gov(crawler, out):
    sitemap = crawler.fetch_sitemap(
        'https://pl.usembassy.gov/sitemap_index.xml')
    trans_regex = re.compile(
        r'<h3>Tłumaczenie</h3>'
        r'<div class="translations_sidebar"><ul><li><a href ?="([^"]*)"')
    pubdate_regex = re.compile(
        r'<meta property="article:published_time" content="([^"]*)"')
    links = set()
    for key in sorted(sitemap.keys()):
        if _pl_usembassy_gov_path(key):
            links.add(key)
    for link in sorted(links):
        result = crawler.fetch(link)
        if result.status != 200:
            continue
        html = result.content.decode('utf-8')
        title = extract('<title>', '</title>', html) or ''
        title = title.split(' | ')[0]
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        trans_match = trans_regex.search(html)
        trans = trans_match.group(1) if trans_match else None
        if pubdate is None:
            pubdate = result.headers.get('Last-Modified')
        if pubdate is None:
            pubdate = sitemap[link]
        exstart = '<div class="entry-content">'
        exstart2 = '<div class="mo-page-content">'
        exend = ('<!-- AddThis Advanced Settings above via filter '
                 'on the_content -->')
        exstart = exstart2 if exstart2 in html else exstart
        content = extract(exstart, exend, html)
        cleanparas = clean_paragraphs(content) if content else None
        # Don't repeat the title if it's the only text content.
        cleantitle = cleantext(title)
        if cleanparas:
            if len(cleanparas) == 1 and cleanparas[0] == cleantitle:
                paras = [cleantitle]
            else:
                paras = [cleantitle] + cleanparas
        else:
            paras = [cleantitle]
        # There are quite a few media pages whose only text is the filename;
        # this, conveniently, is typically also the post's name.
        if len(paras) == 1 and paras[0].lower() in urlpath(link).lower():
            continue
        if paras:
            out.write('# Location: %s\n' % link)
            out.write('# Genre: Diplomatic\n')
            if trans:
                out.write('# Translation: %s\n' % trans)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

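# urlpath() and _pl_usembassy_gov_path() are defined elsewhere. Sketches
# under stated assumptions: urlpath() returns just the path component of a
# URL, and the path filter keeps ordinary content pages (exactly what it
# excludes is an assumption).
def urlpath(url):
    return urlparse(url).path


def _pl_usembassy_gov_path(url):
    return '/wp-content/' not in urlpath(url)
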
def _crawl_pravda_sk(crawler, out):
    for url in sorted(_find_urls_on_pravda_sk(crawler)):
        doc = crawler.fetch_content(url)
        title = re.search(r'<h1[^>]*>(.+?)</h1>', doc)
        title = title.group(1) if title else ''
        pubdate = re.search(
            r'<meta property="article:published_time" content="(.+?)"', doc)
        pubdate = pubdate.group(1) if pubdate else None
        text = extract('<div class="article-detail-perex">',
                       '<div class="clearfix">', doc) or ''
        paras = clean_paragraphs(title + '<br/>' + text)
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

def _crawl_utusan_borneo_berita_iban(crawler, out):
    for url in sorted(_find_urls_utusan_borneo_berita_iban(crawler)):
        doc = crawler.fetch_content(url)
        title = re.search(r'<meta property="og:title" content="(.+?)"', doc)
        title = title.group(1) if title else ''
        body = extract('<p>', '<footer>', doc) or ''
        paras = clean_paragraphs('<h1>%s</h1>' % title + body)
        pubdate = re.search(
            r'<meta property="article:published_time" content="([\d\-]+)"',
            doc)
        pubdate = pubdate.group(1) if pubdate else None
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

def _rte_cleanall(html):
    section_article_regex = re.compile(
        r'<section[^>]+itemprop="articleBody"[^>]*>')
    search = section_article_regex.search(html)
    out = []
    if search:
        body = extract(search.group(0), '</section>', html)
        for para in clean_paragraphs(body):
            if _rte_writable_paragraph(para):
                out.append(para)
        return '\n'.join(out)
    for paragraph in re.findall(r'<p>(.+?)</p>', html):
        cleaned = cleantext(paragraph)
        if _rte_writable_paragraph(cleaned):
            out.append(cleaned)
    return '\n'.join(out)

def _crawl_jornalet_com(crawler, out):
    for url in sorted(_find_urls_jornalet_com(crawler)):
        try:
            html = crawler.fetch_content(url)
        except UnicodeDecodeError:
            continue
        title = re.search(r'<meta property="og:title" content="([^"]+)"',
                          html)
        title = title.group(1) if title else ''
        subtitle = extract('<h4 class="subtitol">', '</h4>', html) or ''
        content = extract('<p class="contingut">', '<hr', html) or ''
        paras = clean_paragraphs('\n'.join(
            ['<p>%s</p>' % p for p in (title, subtitle, content) if p]))
        paras = [p for p in paras
                 if p.find('Abonar los amics de Jornalet') < 0]
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        out.write('\n'.join(paras) + '\n')

def crawl_coislife_ie(crawler, out):
    links = set()
    for num in range(1, 12):
        if num > 1:
            listurl = ('https://www.coislife.ie/product-category/ga/page/%s/'
                       % num)
        else:
            listurl = 'https://www.coislife.ie/product-category/ga/'
        idxres = crawler.fetch(listurl)
        if idxres.status != 200:
            continue
        idxhtml = idxres.content.decode('utf-8')
        index = extract('<div class="products-archive--products">',
                        '<nav class="woocommerce-pagination">', idxhtml) or ''
        for link in re.findall(
                r'<a href="(https://www.coislife.ie/product/[^"]+?)">',
                index):
            links.add(link)
    for url in sorted(links):
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = re.search(r'<title>(.+?)</title>', html)
        title = title.group(1).split('–')[0].strip() if title else ''
        desc = re.search(
            r'<meta property="og:description" content="([^"]+?)"', html)
        desc = cleantext(desc.group(1)) if desc else ''
        body = extract(
            '<div class="tab-content">',
            '<div class="entry-content in fade tab-pane" '
            'id="tab-additional_information">', html) or ''
        paras = clean_paragraphs(title + '<br/>' + body)
        pubdate = fetchresult.headers.get('Last-Modified')
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: Commerce\n')
            if desc:
                out.write('# Description: %s\n' % desc)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            for para in paras:
                if para.find('Léigh sliocht as an leabhar') >= 0:
                    continue
                out.write(para + '\n')

def _crawl_news_mn(crawler, out):
    index = crawler.fetch_content(
        'https://www.news.mn/api/v1/mongo/getNewsByLang?id=3')
    for i in sorted(set(item['newsId'] for item in json.loads(index))):
        url = 'https://www.news.mn/api/v1/news/%d/-1' % i
        doc = json.loads(crawler.fetch_content(url))
        pubDate = doc.get('publishDate', doc.get('createdAt'))
        if pubDate:  # epoch milliseconds
            pubDate = datetime.utcfromtimestamp(pubDate / 1000.0).isoformat()
        title = doc.get('title', '')
        html = '<h1>%s</h1>%s' % (title, doc.get('infoHtml', ''))
        text = '\n'.join(clean_paragraphs(html))
        # Map characters through QAGAN_CHARMAP; anything unmapped is dropped.
        text = ''.join([QAGAN_CHARMAP.get(c, '') for c in text])
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubDate:
            out.write('# Publication-Date: %sZ\n' % pubDate)
        out.write(text + '\n')

def crawl_forasnagaeilge_ie(crawler, out):
    sitemap = crawler.fetch_sitemap(
        'https://www.forasnagaeilge.ie/sitemap_index.xml')
    pubdate_regex = re.compile(r'"datePublished":"([^"]+)",')
    for url in sorted(sitemap.keys()):
        orig_url = url
        if '?lang=en' in url:
            ga_url = url.replace('?lang=en', '')
            if ga_url in sitemap:
                continue  # prefer the Irish-language version
        if '/blog-en/' in url:
            continue
        if '/corporate-information/' in url:
            continue
        if '/torthai-cuardaigh/' in url:
            continue
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        if '<html class="no-js" lang="en">' in html:
            continue
        title = extract('<title>', ' - www.forasnagaeilge.ie</title>',
                        html) or ''
        pubdate_match = pubdate_regex.search(html)
        if pubdate_match:
            pubdate = pubdate_match.group(1)
        else:
            pubdate = sitemap.get(url) or sitemap[orig_url]
        body = extract('<div id="main" class="container">',
                       '</div><!-- /.content -->', html)
        if not body:
            continue
        paras = clean_paragraphs(body)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            out.write('# Title: %s\n' % title)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

def _crawl_disom_khobor(crawler, out):
    index = crawler.fetch(
        'http://wesanthals.tripod.com/id43.html').content.decode('ISO-8859-1')
    for url in sorted(set(re.findall(
            r'http://wesanthals.tripod.com/(?:disomk02|DK-\d+)/[^"\']+',
            index))):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        assert b'charset=ISO-8859-1' in doc.content
        html = extract('sahta 1', '<hr', doc.content.decode('ISO-8859-1'))
        if not html:
            continue
        # Take the latest of the dd/mm/yy(yy) dates found on the page.
        pubdate = max([_parse_date(d)
                       for d in re.findall(r'\d\d/\d\d/\d{2,4}', html)])
        # Normalize spacing around commas and parentheses before conversion.
        html = html.replace(' ,', ',').replace(',', ', ')
        html = html.replace('(', ' (').replace(')', ') ')
        html = html.replace(') ,', '),')
        text = '\n'.join([_to_unicode(p) for p in clean_paragraphs(html)])
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        out.write('# Publication-Date: %s\n' % pubdate)
        out.write(text + '\n')

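# _parse_date() is defined elsewhere. A hypothetical sketch: it normalizes
# the dd/mm/yy(yy) strings found on these pages into ISO YYYY-MM-DD so that
# max() above picks the most recent date lexicographically.
def _parse_date(date):
    day, month, year = date.split('/')
    if len(year) == 2:
        year = '20' + year
    return '%04d-%02d-%02d' % (int(year), int(month), int(day))
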
def _crawl_raestdzinad_ru(crawler, out):
    urls = crawler.fetch_sitemap(
        urlencode('https://растдзинад.рф/sitemap_index.xml'))
    for url in sorted(urls):
        if re.search(r'/20\d{2}/', url) is None:
            continue
        html = crawler.fetch_content(url)
        title = extract('<h1 class="entry-title">', '</h1>', html) or ''
        text = extract('<div class="td-post-content">', '<footer>',
                       html) or ''
        text = text.split('<div class = "evc-social-likes"')[0]
        pubdate = re.search(
            r'<meta property="article:published_time" content="([^"]+)"',
            html)
        if pubdate:
            pubdate = pubdate.group(1)
        paras = clean_paragraphs('%s<p/>%s' % (title, text))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

def _crawl_eestikirik_ee(crawler, out):
    for url in sorted(_find_urls_eestikirik_ee(crawler)):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = extract('<h1 class="entry_title">', '</h1>', html) or ''
        entry = extract('<div class="entry">',
                        '<div style="min-height:33px;"', html) or ''
        pubdate = re.search(
            r'(\d{1,2})\.(\d{1,2})\.(20\d{2})',
            extract('<div id="content">', '</small>', html) or '')
        if pubdate is not None:
            pubdate = '%04d-%02d-%02d' % (int(pubdate.group(3)),
                                          int(pubdate.group(2)),
                                          int(pubdate.group(1)))
        paras = clean_paragraphs('%s<br/>%s' % (title, entry))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

def _crawl_val_levante_emv_com(crawler, out):
    urls = set()
    for url in crawler.fetch_sitemap('http://val.levante-emv.com/sitemap.xml'):
        url = url.replace('//www.levante-emv.com', '//val.levante-emv.com')
        if re.search(r'/\d{4}/\d{2}/\d{2}/', url) is not None:
            urls.add(url)
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        es_url = url.replace('//val.levante-emv.com', '//www.levante-emv.com')
        html = doc.content.decode('utf-8')
        pubdate = re.search(
            r'<meta name="cXenseParse:recs:publishtime" content="([^"]+)"',
            html)
        pubdate = pubdate.group(1) if pubdate else None
        title = extract('<span itemprop="articleBody">', '</h1>', html)
        subtitle = extract('<h2 itemprop="description">', '</h2>', html)
        content = extract('<span itemprop="articleBody">',
                          '</apertium-notrans>', html)
        paras = clean_paragraphs(''.join(
            ['<p>%s</p>' % p for p in (title, subtitle, content) if p]))
        text = '\n'.join(paras)
        for sep in ['Compartir en Twitter', 'HEMEROTECA\n', '\nPublicitat\n']:
            text = text.split(sep)[0].strip()
        if not text:
            continue
        if any(b in text for b in [
                'inicia sessió si eres subscriptor',
                'Si eres subscriptor inicia sessió',
                'Para continuar leyendo... suscríbete'
        ]):
            continue  # paywalled article
        out.write('# Location: %s\n' % url)
        out.write('# Translation.es: %s\n' % es_url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write(text + '\n')

def _crawl_visao_sapo_pt(crawler, out):
    sitemap = crawler.fetch_sitemap(
        'http://visao.sapo.pt/sitemap/visao_index.xml')
    sitemap.update(
        crawler.fetch_sitemap('http://visao.sapo.pt/sitemap/visao_news.xml'))
    for url in sorted(sitemap):
        html = crawler.fetch_content(url)
        title = re.search(
            r'<meta name="twitter:title" property="og:title" '
            r'content="([^"]+)"', html)
        title = title.group(1) if title else ''
        pubdate = re.search(
            r'<p class="timeStamp publishedDate" datetime="([^"]+)"', html)
        pubdate = pubdate.group(1) if pubdate else None
        body = extract('<div class="afterHeader">', '<footer', html) or ''
        paras = clean_paragraphs('%s<p>%s' % (title, body))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

def _crawl_yaikrb_ru(crawler, out):
    urls = set()
    # The site has an incomplete sitemap, so we also look at the archive
    # pages.
    sitemap = crawler.fetch_sitemap('http://yaikrb.ru/sitemap.xml')
    urls.update(sitemap)
    main = crawler.fetch_content('http://yaikrb.ru/')
    archives = set([str(x) for x in range(1, 150)])
    archives.update(re.findall(r'/xf/num/([^/]+)/', main))
    for a in sorted(archives):
        doc = crawler.fetch(urlencode('http://yaikrb.ru/xf/num/%s/' % a))
        if doc.status != 200:
            continue
        for href in re.findall(r'<div class="n_more"><a href="([^"]+)"',
                               doc.content.decode('utf-8')):
            urls.add(urljoin('http://yaikrb.ru/', href,
                             allow_fragments=False))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = re.search('<meta property="og:title" content="([^"]+)"', html)
        title = title.group(1) if title else ''
        text = extract('<div class="n_text">', '<div class="n_oth">',
                       html) or ''
        paras = clean_paragraphs('<h1>' + title + '</h1>' + text)
        if not paras:
            continue
        pubdate = re.search(
            r'<small>(\d{1,2})\.(\d{1,2})\.(20\d{2})\s*</small></h1>', html)
        if pubdate is not None:
            pubdate = '%s-%s-%s' % (pubdate.group(3), pubdate.group(2),
                                    pubdate.group(1))
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

def _crawl_than_lwin_times(crawler, out):
    urls = find_wordpress_urls(crawler, 'http://thanlwintimes.com/')
    for url in sorted(urls):
        if not url.endswith('/'):
            continue
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        pubdate = re.search(
            r'<time class="entry-date updated td-module-date" '
            r'datetime="([^"]+)"', html)
        pubdate = pubdate.group(1) if pubdate else ''
        title = (extract('<title>', '</title>', html) or '').split('|')[0]
        body = extract('<div class="td-post-content">',
                       "<div class='sfsi_Sicons'", html) or ''
        body = body.split('Please follow and like us')[0]
        paragraphs = clean_paragraphs('%s<br/>%s' % (title, body))
        if len(paragraphs) > 0:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paragraphs) + '\n')

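# find_wordpress_urls() lives in the crawler's shared utilities. A minimal
# hypothetical version: WordPress sites expose their posts through
# sitemap.xml, so walking the sitemap is usually enough (the real helper
# may also crawl monthly archive pages).
def find_wordpress_urls(crawler, site):
    return set(crawler.fetch_sitemap(site + 'sitemap.xml').keys())
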
def _crawl_dnevnik_si(crawler, out):
    urls = set()
    for url in crawler.fetch_sitemap('https://www.dnevnik.si/sitemap'):
        match = re.search(r'#(\d+)$', url)
        if not match:
            match = re.search(r'dnevnik\.si/(\d+)', url)
        if match:
            urls.add('https://www.dnevnik.si/' + match.group(1))
    for url in sorted(urls):
        doc = crawler.fetch_content(url)
        title = re.search(r'<meta name="og:title" content="(.+?)"', doc)
        title = title.group(1).replace('&amp;', '&') if title else ''
        pubdate = re.search(r'<div class="dtstamp" title="(.+?)">', doc)
        pubdate = pubdate.group(1).strip() if pubdate else None
        text = extract('<div class="article-body article-wrap">',
                       '<div class="article-tags">', doc) or ''
        paras = clean_paragraphs(title + '<br/>' + text.replace('\r', '\n'))
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

def _crawl_svaboda_org(crawler, out):
    sitemap = crawler.fetch_sitemap('https://www.svaboda.org/sitemap.xml')
    for url in sorted(sitemap):
        if (url == 'https://www.svaboda.org/' or
                url.startswith('https://www.svaboda.org/z/')):  # index pages
            continue
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = extract('<title>', '</title>', html) or ''
        pubdate = re.search(
            r'<div class="published">\s*<span class="date"\s*>'
            r'\s*<time datetime="([^"]+)"', html)
        pubdate = pubdate.group(1) if pubdate else None
        body = extract('<div class="body-container">', '<div id="comments"',
                       html) or ''
        paras = clean_paragraphs('%s<p/>%s' % (title, body))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')