def crawl_loksatta_com(crawler, out):
    """Crawl loksatta.com articles from its sitemap and write them to `out`.

    Args:
      crawler: project crawler providing fetch() and fetch_sitemap().
      out: writable corpus output stream.
    """
    sitemap = crawler.fetch_sitemap('http://www.loksatta.com/sitemap.xml')
    for url in sorted(sitemap):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        try:
            html = doc.content.decode('utf-8')
        except UnicodeDecodeError:
            continue
        pubdate = re.search(
            r'<meta itemprop="datePublished" content="(.+?)"', html)
        pubdate = cleantext(pubdate.group(1)) if pubdate else None
        headline = extract('<h1 itemprop="headline" id="headline">',
                           '</h1>', html)
        synopsis = extract('<h2 itemprop="description" class="synopsis">',
                           '</h2>', html)
        text = extract('itemprop="articleBody">', '<div', html)
        if not text:
            continue
        text = text.replace('\n', ' ')
        # Turn paragraph/line-break tags into newlines before splitting.
        text = re.sub(r'</?(?:br|BR|p|P)\s*?/?>', '\n', text)
        # Bug fix: materialize the filter. In Python 3 a filter object is
        # always truthy, so the `if paras:` guard below was vacuous and
        # empty pages were written with headers but no text.
        paras = [p for p in
                 (cleantext(p) for p in [headline, synopsis] + text.splitlines())
                 if p]
        if paras:
            out.write('# Location: %s\n# Genre: News\n' % url)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
def crawl_azattyk_org(crawler, out):
    """Crawl azattyk.org article pages and write extracted text to `out`.

    NOTE(review): a later definition of the same name in this file shadows
    this one; only the last definition takes effect at import time.
    """
    sitemap = crawler.fetch_sitemap('https://www.azattyk.org/sitemap.xml')
    for url in sorted(sitemap.keys()):
        # Only article pages live under /a/.
        if not urlpath(url).startswith('/a/'):
            continue
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        pubdate = re.search(r'"dateModified":"([^"]+)"', html)
        if pubdate is not None:
            pubdate = cleantext(pubdate.group(1)).replace(' ', 'T')
        title = extract('<title>', '</title>', html)
        text = extract('content-offset">', '</div>', html)
        if not title or not text:
            continue
        lines = [title] + re.sub(r'<br\s*?/?>', '\n', text).splitlines()
        cleaned = (cleantext(line) for line in lines)
        paras = [p for p in cleaned if p and not p.startswith('http')]
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')
def _crawl_observador_pt(crawler, out):
    """Crawl observador.pt articles discovered via author profile pages."""
    urls = set()
    for author_page in sorted(
            re.findall(
                r'href="(https?://observador.pt/perfil/[a-zA-Z_\-0-9]+/)"',
                crawler.fetch_content('http://observador.pt/autores/'))):
        html = crawler.fetch_content(author_page)
        urls.update(
            re.findall(
                r'href="(https?://observador.pt/20\d{2}/\d{2}/\d{2}/[^"]+)"',
                html))
    for url in sorted(urls):
        try:
            html = crawler.fetch_content(url)
        except UnicodeDecodeError:
            continue
        # Bug fix: the original called .group(1) unconditionally on the
        # match objects; when a regex did not match, re.search returns
        # None and the call raised AttributeError.
        title = re.search(r'<meta property="og:title" content="([^"]+)"',
                          html)
        title = title.group(1) if title else ''
        pubdate = re.search(r'"dateModified":"([^"]+)"', html)
        pubdate = pubdate.group(1) if pubdate else None
        lead = extract('<div class="lead">', '</div>', html) or ''
        content = extract('<div class="content">', '<h1>', html) or ''
        text = '\n'.join(clean_paragraphs('<p>'.join([title, lead, content])))
        # Strip trailing boilerplate ("read more", share/comment links).
        for marker in ('\nContinuar a ler', '\nLer mais', '\nPartilhe',
                       '\nComente'):
            text = text.split(marker)[0]
        if text:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write(text)
            out.write('\n')
def _crawl_kauno_diena_lt(crawler, out):
    """Crawl kauno.diena.lt news articles listed in its sitemaps."""
    urls = {}
    for i in range(1, 6):
        urls.update(crawler.fetch_sitemap(
            'http://kauno.diena.lt/sitemap/kd/sitemap%d.xml' % i))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        try:
            html = doc.content.decode('utf-8')
        except UnicodeDecodeError:
            continue
        title = extract('<h1 class="title" id="page-title">', '</h1>', html)
        title = cleantext(title if title else '')
        body = extract("<span itemprop='articleBody'>", '</div>', html) or ''
        paras = []
        for para in clean_paragraphs('%s<br/>%s' % (title, body)):
            # Everything from this marker on is Word-export boilerplate.
            if 'MicrosoftInternetExplorer4' in para:
                break
            paras.append(para)
        pubdate = re.search(
            r'<span\s+property="dc:date\s+dc:created"\s+content="(20[^"]+)"',
            html)
        pubdate = pubdate.group(1) if pubdate else None
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
def crawl_ainm_ie(crawler, out):
    """Crawl biographies on ainm.ie, indexed by initial letter A-Z."""
    links = set()
    for letter in map(chr, range(65, 91)):  # 'A'..'Z'
        idxres = crawler.fetch('https://www.ainm.ie/Abc.aspx?Letter=%s'
                               % letter)
        if idxres.status != 200:
            continue
        idxhtml = idxres.content.decode('utf-8')
        index = extract('<div id="pageContent" role="main">',
                        '<!-- .contentWrapper-->', idxhtml)
        # Bug fix: extract() may return None; re.findall on None raises
        # TypeError and aborted the whole crawl.
        for link in re.findall(r'<a href="(Bio.aspx\?ID=[^"]+?)">',
                               index or ''):
            links.add('https://www.ainm.ie/%s' % link)
    for url in sorted(links):
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = re.search(r'<title>(.+?)</title>', html)
        title = title.group(1).split('|')[0] if title else ''
        body = extract('<div class="article">', '<!-- .contentWrapper-->',
                       html) or ''
        body = body.split('<div id="machines"')[0]
        paras = clean_paragraphs(title + '<br/>' + body)
        pubdate = fetchresult.headers.get('Last-Modified')
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: Biography\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
def _crawl_iltirreno_gelocal_it(crawler, out):
    """Crawl Il Tirreno local-news categories and write articles to `out`."""
    categories = ('italia-mondo', 'focus/toscana-economia', 'empoli/cronaca',
                  'grosseto/cronaca', 'livorno/cronaca', 'livorno/dagli-enti',
                  'lucca/cronaca', 'pisa/cronaca', 'prato/cronaca',
                  'versilia/cronaca')
    urls = set()
    for category in categories:
        urls.update(_find_tirreno_urls(crawler, category))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        content = doc.content.decode('utf-8')
        header = extract('<h1 itemprop="headline name">',
                         '<span itemprop="author"', content) or ''
        body = extract('<span itemprop="articleBody" >', '©', content) or ''
        text = '\n'.join(clean_paragraphs('%s<p/>%s' % (header, body)))
        # Cut everything after the first piece of site boilerplate.
        for sep in ('Tags\n', 'Redazione | Scriveteci', 'TrovaRistorante',
                    '<a href="', 'I COMMENTI DEI LETTORI', '©RIPRODUZIONE'):
            text = text.split(sep)[0]
        paras = text.splitlines()
        pubdate = re.search(
            r'<time itemprop="datePublished" content="([^"]+)"', content)
        pubdate = pubdate.group(1) if pubdate else None
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
def crawl_blogspot(crawler, out, host):
    """Crawl all posts of the Blogspot blog at `host` and write to `out`."""
    sitemap = crawler.fetch_sitemap('https://%s/sitemap.xml' % host)
    pubdate_regex = re.compile(
        r"<abbr class='published' title='([^']*)'>[^<]*</abbr>")
    for url in sorted(sitemap.keys()):
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        # Publication date: page markup, then HTTP header, then sitemap.
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        if pubdate is None:
            pubdate = fetchresult.headers.get('Last-Modified')
        if pubdate is None:
            pubdate = sitemap[url]
        title = re.search(r"<meta content='([^']+)' property='og:title'/>",
                          html)
        title = title.group(1) if title else ''
        # Blogspot themes mark up the post body in one of several ways.
        # Idiom fix: use `is None` rather than `== None`.
        post = extract("<div class='post-body entry-content'>",
                       "<div class='post-footer'>", html)
        if post is None:
            post = extract("<div class='post-header'>",
                           "<div class='post-footer'>", html)
        if post is None:
            post = extract('<div class="post-body">',
                           '<p class="post-footer">', html)
        # Bug fix: when no variant matched, the original crashed with
        # TypeError on `title + '<br/>' + None`. Skip such pages instead.
        if post is None:
            continue
        paras = clean_paragraphs(title + '<br/>' + post)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: Blog\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
def crawl_tuairisc_ie(crawler, out):
    """Crawl tuairisc.ie (Irish-language news) and write articles to `out`."""
    sitemap = crawler.fetch_sitemap('https://tuairisc.ie/sitemap.xml')
    pubdate_regex = re.compile(
        r'<time datetime="(20\d\d-\d\d-\d\d)\s+(\d\d:\d\d)" '
        r'itemprop="datePublished">')
    for url in sorted(sitemap.keys()):
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = extract('<h1 class="title article--full__title">', '</h1>',
                        html) or ''
        pubdate_match = pubdate_regex.search(html)
        if pubdate_match:
            # Normalize to ISO-8601 with an explicit UTC marker.
            pubdate = '%sT%s:00Z' % pubdate_match.groups()
        else:
            pubdate = sitemap[url]
        body = extract(
            '<div class="article--full__content" itemprop="articleBody">',
            '</article>', html)
        if not body:
            continue
        paras = clean_paragraphs(title + '<p/>' + body)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
def crawl_meoneile_ie(crawler, out):
    """Crawl meoneile.ie articles, recording any embedded Vimeo video."""
    sitemap = crawler.fetch_sitemap('https://meoneile.ie/sitemap.xml')
    for url in sorted(sitemap.keys()):
        if url == 'https://meoneile.ie/':
            continue
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        # Bug fix: extract() may return None, and the original called
        # .strip() on it *before* its None check, raising AttributeError.
        title = extract('<title>', '</title>', html) or ''
        title = title.strip().split('<')[0].strip()
        video = re.search(
            r"<iframe.*src='(//player.vimeo.com/video/[0-9]+)[^>]*></iframe>",
            html)
        body = extract("<div class='article-content'>", '</article>',
                       html) or ''
        byline = extract("<div class='byline'>", '</span>', html) or ''
        byline = _byline_to_pubdate(byline)
        # Avoid repeating the title when the body already opens with it.
        if body.find('<strong>%s</strong>' % title) >= 0:
            title = ''
        paras = clean_paragraphs(title + '<br/>' + body)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if video:
                out.write('# Video: https:%s\n' % video.group(1))
            if byline:
                out.write('# Publication-Date: %s\n' % byline)
            for para in paras:
                if para == 'Roinn':  # share-button label, not content
                    continue
                out.write(para + '\n')
def crawl_azattyk_org(crawler, out):
    """Crawl azattyk.org articles; heuristically skips English pages.

    NOTE(review): this re-defines crawl_azattyk_org (an earlier definition
    exists in this file); this later definition is the effective one.
    """
    sitemap = crawler.fetch_sitemap('https://www.azattyk.org/sitemap.xml')
    for url in sorted(sitemap.keys()):
        if not urlpath(url).startswith('/a/'):
            continue
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        pubdate = re.search(r'"dateModified":"([^"]+)"', html)
        if pubdate is not None:
            pubdate = cleantext(pubdate.group(1)).replace(' ', 'T')
        title = extract('<title>', '</title>', html)
        text = extract('content-offset">', '<footer', html)
        if not title or not text:
            continue
        # Drop share widgets and region boxes, then flatten the markup.
        text = text.split('<span class="share')[0]
        text = text.split('<div class="region"')[0]
        text = text.replace('\n', ' ')
        lines = [title] + re.sub(r'<(?:br|p|div)\s*?/?>', '\n',
                                 text).splitlines()
        cleaned = (cleantext(line.strip()) for line in lines)
        paras = [p for p in cleaned if p and not p.startswith('http')]
        if not paras:
            continue
        # Filter out English text: skip pages whose first or last
        # character falls in the Latin-1 range.
        if ord(paras[0][0]) <= 0xFF or ord(paras[-1][-1]) <= 0xFF:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')
def _crawl_mwnation_com(crawler, out):
    """Crawl the Chichewa section of mwnation.com."""
    urls = set()
    index = crawler.fetch_content('http://mwnation.com/section/chichewa/')
    pages = re.findall(r'/section/chichewa/page/(\d+)/', index)
    # Robustness fix: fall back to a single page when no pagination links
    # are found (the original max([]) raised ValueError).
    num_pages = max((int(p) for p in pages), default=1)
    for page in range(1, num_pages + 1):
        url = 'http://mwnation.com/section/chichewa/'
        if page > 1:
            url += 'page/%d/' % page
        doc = crawler.fetch_content(url)
        urls.update(re.findall(r'<a href="([^"]+?)">Continue Reading', doc))
    for url in sorted(urls):
        doc = crawler.fetch_content(url)
        pubdate = re.search(
            r'<meta property="article:published_time" content="([^"]+)"', doc)
        pubdate = pubdate.group(1) if pubdate is not None else None
        title = extract('<h1 class="entry-title" itemprop="headline">',
                        '</h1>', doc) or ''
        body = extract('<div class="entry-content" itemprop="articleBody">',
                       '<footer ', doc) or ''
        paras = clean_paragraphs(title + '<br/>' + body)
        text = '\n'.join(paras) + '\n'
        if text.find(' the ') >= 0:  # likely English
            continue
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write(text)
def crawl_wantokniuspepa_com(crawler, out):
    """Crawl news sections of wantokniuspepa.com (Tok Pisin newspaper)."""
    sections = {
        'abc-pasifik-nius', 'bisnis-nius', 'helt-nius', 'komentri',
        'laip-stail', 'meri-nius', 'nius', 'wantok'
    }
    # Build the list of section/pagination seed pages.
    seeds = set()
    for section in sorted(sections):
        section_url = 'http://wantokniuspepa.com/index.php/%s' % section
        seeds.add(section_url)
        section_index = crawler.fetch(section_url)
        assert section_index.status == 200, (section_index.status,
                                             section_url)
        last_page = re.search(r'"End" href=".+?start=(\d+)" class="pagenav"',
                              section_index.content.decode('utf-8'))
        if last_page is not None:
            for page in range(1, int(last_page.group(1)) + 1):
                seeds.add('http://wantokniuspepa.com/index.php/%s?start=%d'
                          % (section, page))
    # Collect article URLs from every seed page.
    urls = set()
    for seed in sorted(seeds):
        doc = crawler.fetch(seed)
        # Bug fix: the assertion message referenced the then-undefined
        # name `url`, so any fetch failure raised NameError instead of a
        # useful AssertionError.
        assert doc.status == 200, (doc.status, seed)
        content = doc.content.decode('utf-8')
        for u in re.findall(r'(/index\.php/[^"]+?)"', content):
            p = u.split('/')
            if len(p) > 3 and p[1] == 'index.php' and p[2] in sections:
                if re.search(r'/\d{4,}', u) is not None:
                    urls.add('http://wantokniuspepa.com' + u.split('?')[0])
    for url in sorted(urls):
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        title = extract('<title>', '</title>', content)
        pubdate = re.search(
            r'<time datetime="([^T]+?)T([^"]+?)" '
            'itemprop="datePublished">', content)
        pubdate = cleantext(pubdate.group(1)) if pubdate else None
        body = extract('<div itemprop="articleBody">', '<ul class="pager',
                       content)
        if not body:
            continue
        body = body.split('<div class="clearfix"')[0]
        text = body.replace('\n', ' ')
        text = text.replace(' ,', ',').replace('“ ', '“')
        text = re.sub(r'</(?:div|DIV|p|P|[hH][1-6]|table|TABLE)>', '\n', text)
        text = re.sub(r'<(?:br|BR)\s*/?>', '\n', text)
        # Bug fix: materialize the filter; in Python 3 a filter object is
        # always truthy, so `if not paras` below never fired.
        paras = [p for p in
                 (cleantext(p) for p in [title] + text.splitlines()) if p]
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')
def crawl_pl_usembassy_gov(crawler, out):
    """Crawl pl.usembassy.gov and write diplomatic texts to `out`."""
    sitemap = crawler.fetch_sitemap(
        'https://pl.usembassy.gov/sitemap_index.xml')
    trans_regex = re.compile(
        r'<h3>Tłumaczenie</h3><div class="translations_sidebar"><ul><li>'
        r'<a href ?="([^"]*)"')
    pubdate_regex = re.compile(
        r'<meta property="article:published_time" content="([^"]*)"')
    links = {key for key in sitemap.keys() if _pl_usembassy_gov_path(key)}
    for link in sorted(links):
        result = crawler.fetch(link)
        if result.status != 200:
            continue
        html = result.content.decode('utf-8')
        # Keep only the page-specific part of the <title>.
        title = extract('<title>', '</title>', html) or ''
        if ' | ' in title:
            title = title.split(' | ')[0]
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        trans_match = trans_regex.search(html)
        trans = trans_match.group(1) if trans_match else None
        if pubdate is None:
            pubdate = result.headers.get('Last-Modified')
        if pubdate is None:
            pubdate = sitemap[link]
        # Newer pages use a different content container.
        exend = ('<!-- AddThis Advanced Settings above via filter on '
                 'the_content -->')
        exstart = '<div class="entry-content">'
        if '<div class="mo-page-content">' in html:
            exstart = '<div class="mo-page-content">'
        content = extract(exstart, exend, html)
        cleanparas = clean_paragraphs(content) if content else None
        cleantitle = cleantext(title)
        # Don't repeat the title if it's the only text content.
        if cleanparas and not (len(cleanparas) == 1
                               and cleanparas[0] == cleantitle):
            paras = [cleantitle] + cleanparas
        else:
            paras = [cleantitle]
        # There are quite a few media pages whose only text is the file
        # name; this, conveniently, is typically also the post's name.
        if len(paras) == 1 and paras[0].lower() in urlpath(link).lower():
            continue
        if paras:
            out.write('# Location: %s\n' % link)
            out.write('# Genre: Diplomatic\n')
            if trans:
                out.write('# Translation: %s\n' % trans)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
def crawl_larenadomila_it(crawler):
    """Crawl larenadomila.it (Venetian, Verona) into the crawler's output."""
    out = crawler.get_output(language='vec-u-sd-itvr')
    urls = find_urls_in_larenadomila_it(
        crawler, 'https://www.larenadomila.it/sito/index.php')
    for url in sorted(urls.difference(BLACKLISTED_URLS)):
        if url.find('&view=article&') < 0:
            continue
        doc = crawler.fetch(url)
        # Bug fix: the assertion message referenced the undefined name
        # `start_url`, turning any fetch failure into a NameError.
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        title = cleantext(extract('<title>', '</title>', content))
        sections = [title] + [c.strip() for c in content.splitlines()]
        sections = [c for c in sections
                    if c.startswith('<div class="item_fulltext">')
                    or c.startswith('<p><span class="grassetto">')]
        sections = [c.replace(' <br />- ', ' ') for c in sections]
        text = '<br/>'.join(sections)
        # Bug fix: the original replace(' ', ' ') was a no-op; per its
        # comment the intent was to normalize non-breaking spaces used
        # for spacing/formatting.
        text = text.replace('\u00a0', ' ')
        text = re.sub(r'</(?:div|DIV|p|P|[hH][1-6]|table)>', '\n', text)
        text = re.sub(r'<br\s*/?>', '\n', text)
        text = re.sub(r'\.{3,}', '… ', text)
        text = re.sub(r'\n(-)[^\s]', '- ', text)
        # Bug fix: materialize the filter; in Python 3 a filter object is
        # always truthy, so empty pages were emitted with headers only.
        paras = [p for p in (cleantext(p) for p in text.split('\n')) if p]
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('\n'.join(paras) + '\n')
def _crawl_vatantat_ru(crawler, out):
    """Crawl vatantat.ru (Tatar newspaper "Vatanym Tatarstan") pages."""
    index = crawler.fetch_content('http://www.vatantat.ru/')
    page_numbers = [int(p)
                    for p in re.findall(r'index\.php\?pg=(\d+?)"', index)]
    last = max(page_numbers)
    for page in range(2, last + 1):
        url = 'http://www.vatantat.ru/index.php?pg=%d' % page
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        content = doc.content.decode('utf-8')
        html = extract('<p><span style="font-size: large;"><strong>',
                       '<span style="font-size: 80%; font-weight: bold;">',
                       content)
        if not html:
            continue
        # Cut the trailing attribution line and any scripts.
        html = html.split('(“Ватаным Татарстан”,')[0]
        html = html.split('<script>')[0]
        paras = clean_paragraphs(html)
        if not paras:
            continue
        pubdate = re.search(
            r'Татарстан”, /№ (none|\d+), '
            r'(\d\d)\.(\d\d)\.(20\d\d)/', content)
        if pubdate is not None:
            # Reorder dd.mm.yyyy into ISO yyyy-mm-dd.
            pubdate = '%s-%s-%s' % (pubdate.group(4), pubdate.group(3),
                                    pubdate.group(2))
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')
def _crawl_kyym_ru(crawler, out):
    """Crawl kyym.ru (Sakha-language newspaper) article pages."""
    index = crawler.fetch_content('http://www.kyym.ru/')
    last = max(
        int(s)
        for s in re.findall(r'href="/index\.php\?start=(\d+?)"', index))
    urls = set()
    for page in range(1, last + 1):
        doc = crawler.fetch_content('http://www.kyym.ru/index.php?start=%d'
                                    % page)
        for path in re.findall(
                r'<a href="(/index\.php\?view=article&[^"]+?)"', doc):
            # Bug fix: the original replace('&', '&') was a no-op; the
            # intent is to decode HTML-escaped ampersands in hrefs.
            urls.add('http://www.kyym.ru' + path.replace('&amp;', '&'))
    for url in sorted(urls):
        doc = crawler.fetch_content(url)
        html = extract('<div class="news_item_article">',
                       '<!--end news item -->', doc)
        if not html:
            continue
        paras = clean_paragraphs(html)
        if not paras:
            continue
        pubdate = re.search(
            r'<span class="createdate"><!-- date and by -->'
            r'\s*(\d{1,2}).(\d{2}).(20\d{2})', doc, flags=re.DOTALL)
        if pubdate is not None:
            # Reorder dd.mm.yyyy into ISO yyyy-mm-dd.
            pubdate = '%s-%s-%s' % (pubdate.group(3), pubdate.group(2),
                                    pubdate.group(1))
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')
def _crawl_telegraaf_nl(crawler, out):
    """Crawl telegraaf.nl articles from its (filtered) sitemaps."""
    sitemap = crawler.fetch_sitemap(
        'http://www.telegraaf.nl/sitemap.xml',
        subsitemap_filter=_should_fetch_telegraaf_sitemap)
    for url in sorted(sitemap):
        doc = crawler.fetch(urlencode(url))
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title_match = re.search(
            r'<meta [a-zA-Z\-="]* property="og:title" content="(.+?)"', html)
        title = title_match.group(1) if title_match else ''
        pubdate_match = re.search(r'"publishDate":"([^"]+)"', html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        text = extract(
            'data-element="ArticlePage-intro">',
            '<div class="flex" data-element="ArticlePage-socialShare-root">',
            html) or ''
        paras = clean_paragraphs(title + '<br/>' + text)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
def crawl_azg_am(crawler, out):
    """Crawl the azg.am (Armenian daily) archive, one day at a time."""
    urls = set()
    for d in daterange(date(2001, 1, 9), date.today()):
        datestr = '%04d%02d%02d00' % (d.year, d.month, d.day)
        url = 'http://www.azg.am/AM/%s' % datestr
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        # Article ids are 10-digit date-based numbers; ids ending in 00
        # are the day pages themselves, not articles.
        articles = [
            a for a in re.findall(r'20\d{8}', content)
            if not a.endswith('00')
        ]
        for a in articles:
            urls.add('http://www.azg.am/wap/?nl=AM&id=%s&Base_PUB=0' % a)
    print(len(urls))  # progress indicator
    for url in sorted(urls):
        pubdate = re.search(r'id=(20\d{6})', url).group(1)
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        # Bug fix: extract() returns None when the <hr> markers are
        # missing; the original crashed on None.replace().
        text = extract('<hr>', '<hr>', content) or ''
        text = text.replace('\n', ' ')
        text = re.sub('</(p|h[1-9]|div)>', '\n', text)
        # Bug fix: materialize the filter; in Python 3 a filter object is
        # always truthy, so empty pages were written with headers only.
        paras = [p for p in (cleantext(p) for p in text.splitlines()) if p]
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            out.write('# Publication-Date: %s-%s-%s\n'
                      % (pubdate[:4], pubdate[4:6], pubdate[6:8]))
            out.write('\n'.join(paras) + '\n')
def _crawl_jornalet_com(crawler, out):
    """Crawl jornalet.com (Occitan online newspaper) articles."""
    for url in sorted(_find_urls_jornalet_com(crawler)):
        try:
            html = crawler.fetch_content(url)
        except UnicodeDecodeError:
            continue
        title_match = re.search(
            r'<meta property="og:title" content="([^"]+)"', html)
        title = title_match.group(1) if title_match else ''
        subtitle = extract('<h4 class="subtitol">', '</h4>', html) or ''
        content = extract('<p class="contingut">', '<hr', html) or ''
        pieces = ['<p>%s</p>' % p for p in (title, subtitle, content) if p]
        paras = clean_paragraphs('\n'.join(pieces))
        # Drop the recurring donation plea.
        paras = [p for p in paras
                 if p.find('Abonar los amics de Jornalet') < 0]
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        out.write('\n'.join(paras) + '\n')
def crawl_coislife_ie(crawler, out):
    """Crawl coislife.ie product pages (Irish-language publisher shop)."""
    links = set()
    for num in range(1, 12):
        if num > 1:
            listurl = ('https://www.coislife.ie/product-category/ga/page/%s/'
                       % num)
        else:
            listurl = 'https://www.coislife.ie/product-category/ga/'
        idxres = crawler.fetch(listurl)
        if idxres.status != 200:
            continue
        idxhtml = idxres.content.decode('utf-8')
        index = extract('<div class="products-archive--products">',
                        '<nav class="woocommerce-pagination">', idxhtml)
        # Bug fix: extract() may return None; re.findall on None raises
        # TypeError.
        for link in re.findall(
                r'<a href="(https://www.coislife.ie/product/[^"]+?)">',
                index or ''):
            links.add(link)
    for url in sorted(links):
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = re.search(r'<title>(.+?)</title>', html)
        title = title.group(1).split('–')[0].strip() if title else ''
        desc = re.search(
            r'<meta property="og:description" content="([^"]+?)"', html)
        # Bug fix: the original called desc.group(1) unconditionally and
        # crashed with AttributeError when the meta tag was absent.
        desc = cleantext(desc.group(1)) if desc else ''
        body = extract(
            '<div class="tab-content">',
            '<div class="entry-content in fade tab-pane" '
            'id="tab-additional_information">', html) or ''
        paras = clean_paragraphs(title + '<br/>' + body)
        pubdate = fetchresult.headers.get('Last-Modified')
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: Commerce\n')
            if desc:
                out.write('# Description: %s\n' % desc)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            for para in paras:
                # Skip the "read an excerpt from the book" teaser line.
                if para.find('Léigh sliocht as an leabhar') < 0:
                    out.write(para + '\n')
def crawl_jagbani_punjabkesari_in(crawler, out):
    """Crawl jagbani.punjabkesari.in (Punjabi news) via its category menu."""
    urls = set()
    main = crawler.fetch('http://jagbani.punjabkesari.in/')
    assert main.status == 200, main.status
    menu = extract('<nav id="menu" class="menu">', '</nav>',
                   main.content.decode('utf-8'))
    urls_re = re.compile(
        r'href="(https?://jagbani\.punjabkesari\.in/[^"]+?)"')
    category_urls = urls_re.findall(menu)
    for category_url in sorted(set([x.strip() for x in category_urls])):
        for page in range(1, 1000):
            doc = crawler.fetch(category_url + '/page/%d' % page)
            content = doc.content.decode('utf-8') if doc.status == 200 else ''
            if content.find('class="story"') < 0:
                break
            # Bug fix: extract() can return None when the end marker is
            # missing; findall(None) raised TypeError.
            story_html = extract('<span class="story">',
                                 '<div class="kjpage"', content) or ''
            for u in urls_re.findall(story_html):
                urls.add(urlencode(u.strip()))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        try:
            content = doc.content.decode('utf-8')
        except UnicodeDecodeError:
            continue
        title = extract('<title>', '</title>', content)
        text = extract('<article>', '</article>', content)
        if not text:
            continue
        # Normalize <br...> variants, then turn breaks into newlines.
        text = re.sub(r'<br[^a-zA-Z][^>]*>', '<br>', text)
        text = text.replace('\n', ' ').replace('<br>', '\n')
        # Bug fix: materialize the filter; in Python 3 a filter object is
        # always truthy, so empty pages were written with headers only.
        paras = [p for p in
                 (cleantext(p) for p in [title] + text.splitlines()) if p]
        pubdate = re.search(
            '<meta property="article:published_time" content="([^"]+?)"',
            content)
        pubdate = pubdate.group(1) if pubdate else None
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write(('\n'.join(paras) + '\n'))
def crawl_peig_ie(crawler, out):
    """Crawl peig.ie, classifying pages and skipping syndicated posts."""
    # The site needs TLS 1.2; the crawler's old context is restored below.
    crawler.set_context(ssl.SSLContext(ssl.PROTOCOL_TLSv1_2))
    sitemap = crawler.fetch_sitemap('https://peig.ie/sitemap_index.xml',
                                    subsitemap_filter=_peig_filter_robots)

    def peig_cat(page):
        # Map URL shapes onto corpus genres.
        if page.find('/imeachtai/') >= 0:
            return 'Events'
        elif page.find('peig.ie/20') >= 0:
            return 'News'
        elif page.find('/fol%C3%BAntais/') >= 0:
            return 'Job listings'
        else:
            return ''

    # Peig.ie has a lot of posts from other sites
    def skip_page(site):
        if site.find('//nos.ie/') >= 0:
            return True
        elif site.find('//tuairisc.ie/') >= 0:
            return True
        elif site.find('//meoneile.ie/') >= 0:
            return True
        else:
            return False

    for url in sorted(sitemap.keys()):
        if url == 'https://peig.ie/':
            continue
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = re.search(r'<title>(.+?)</title>', html)
        title = title.group(1).split('|')[0].strip() if title else ''
        read_more = re.search(r'<a.*href="([^"]+")[^>]*>Níos mó</a>', html)
        if read_more and skip_page(read_more.group(1)):
            continue
        # Bug fix: either date regex can fail to match; the original
        # called .group(1) unconditionally and crashed on AttributeError.
        if '<meta property="article:modified_time"' in html:
            date_match = re.search(
                r'<meta property="article:modified_time" content="([^"]+)"',
                html)
        else:
            date_match = re.search(r'"datePublished":"([^"]+)"', html)
        date = date_match.group(1) if date_match else None
        body = extract('<div class="uk-margin-medium-top" property="text">',
                       '<ul class="uk-pagination', html) or ''
        paras = clean_paragraphs(title + '<br/>' + body)
        genre = peig_cat(url)
        if paras:
            out.write('# Location: %s\n' % url)
            if genre:
                out.write('# Genre: %s\n' % genre)
            if date:
                out.write('# Publication-Date: %s\n' % date)
            out.write('\n'.join(paras) + '\n')
    crawler.set_context(ssl.SSLContext(ssl.PROTOCOL_TLSv1))
def crawl_wikisource_trieste_vernacola(crawler):
    """Crawl the poems of "Trieste vernacola" from Venetian Wikisource."""
    out = crawler.get_output(language='vec-u-sd-itts')
    urls = set()
    index = crawler.fetch(
        'https://vec.wikisource.org/wiki/Indice:Trieste_vernacola.djvu')
    assert index.status == 200, index.status
    remarks = extract('<div id="remarks">', 'Colombe</a>',
                      index.content.decode('utf-8'))
    # Local renamed from `urlpath` to avoid shadowing the module-level
    # helper of the same name used by other crawlers in this file.
    for path in sorted(set(re.findall(r'href="(/wiki/[^"]+)"',
                                      remarks or ''))):
        if not path.startswith('/wiki/Trieste_vernacola/'):
            urls.add('https://vec.wikisource.org' + path)
    for url in sorted(urls.difference(BLACKLISTED_URLS)):
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        # Bug fix: extract() may return None; .split() then crashed.
        text = extract('<div id="scatola" class="testo">', '<noscript>',
                       content) or ''
        text = text.split('<dt>Note</dt>')[0].split('<dl>')[0]
        text = text.replace('\n', ' ')
        text = re.sub(r'<sup.+?</sup>', '', text)
        # Bug fix: the original replace(' ', ' ') was a no-op; per its
        # comment the intent was to normalize NBSP used for spacing.
        text = text.replace('\u00a0', ' ')
        text = text.replace("'", "’")
        text = re.sub(r'<!--.+?-->', '', text, flags=re.DOTALL)
        text = re.sub(r' alt="[^"]+"', ' ', text, flags=re.DOTALL)
        text = re.sub(r'<span class="numeroriga".+?</span>', '', text)
        text = re.sub(r'</(?:div|DIV|p|P|[hH][1-6]|table|TABLE)>', '\n', text)
        text = re.sub(r'<(?:br|BR)\s*/?>', '\n', text)
        lines = [l for l in text.splitlines()
                 if l.find('noprint') < 0 and l.find('font-size:smaller') < 0]
        text = '\n'.join([cleantext(l) for l in lines])
        # Blank lines separate stanzas; verse lines are joined with ' | '.
        text = re.sub('\n{2,}', '<p>', text).replace('\n', ' | ')
        text = text.replace('<p>', '\n')
        # Bug fix: materialize the filter; in Python 3 a filter object is
        # always truthy, so `if not paras` below never fired.
        paras = [p for p in
                 (' '.join(p.split()) for p in text.splitlines()) if p]
        if not paras:
            continue
        # The book, published in 1920, is a collection of earlier lyrics.
        pubyear = re.search(r'<span id="ws-year">(\d{4})</span>', content)
        pubyear = int(pubyear.group(1)) if pubyear else 1920
        out.write('# Location: %s\n' % url)
        out.write('# Genre: Lyrics\n')
        out.write('# Publication-Date: %d\n' % pubyear)
        out.write('\n'.join(paras) + '\n')
def _crawl_asymptotejournal_com(crawler, out):
    """Extract the single Santhali nonfiction piece from Asymptote Journal."""
    url = ('https://www.asymptotejournal.com/nonfiction/'
           'shibu-tudu-memories-of-the-kirta-dangra/santhali/')
    html = crawler.fetch_content(url)
    content = extract('<!-- article content -->',
                      '<img src="/images/end-logo-black.gif"', html)
    out.write('# Location: %s\n' % url)
    out.write('# Genre: Fiction\n')
    # Drop paragraphs that start with an ASCII capital letter —
    # presumably the parallel English text rather than Santhali; verify.
    paras = [p for p in clean_paragraphs(content)
             if p[0] not in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ']
    out.write('\n'.join(paras) + '\n')
def crawl_forasnagaeilge_ie(crawler, out):
    """Crawl forasnagaeilge.ie, preferring Irish pages over English ones."""
    sitemap = crawler.fetch_sitemap(
        'https://www.forasnagaeilge.ie/sitemap_index.xml')
    pubdate_regex = re.compile(r'"datePublished":"([^"]+)",')
    for url in sorted(sitemap.keys()):
        orig_url = url
        if '?lang=en' in url:
            # Skip the English page when an Irish counterpart exists.
            ga_url = url.replace('?lang=en', '')
            if ga_url in sitemap.keys():
                continue
        if ('/blog-en/' in url or '/corporate-information/' in url
                or '/torthai-cuardaigh/' in url):
            continue
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        if '<html class="no-js" lang="en">' in html:
            continue
        title = extract('<title>', ' - www.forasnagaeilge.ie</title>',
                        html) or ''
        pubdate_match = pubdate_regex.search(html)
        if pubdate_match:
            pubdate = pubdate_match.group(1)
        else:
            pubdate = sitemap.get(url) or sitemap[orig_url]
        body = extract('<div id="main" class="container">',
                       '</div><!-- /.content -->', html)
        if not body:
            continue
        paras = clean_paragraphs(body)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            out.write('# Title: %s\n' % title)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
def _crawl_raestdzinad_ru(crawler, out):
    """Crawl растдзинад.рф (Ossetian newspaper "Rastdzinad") articles."""
    sitemap = crawler.fetch_sitemap(
        urlencode('https://растдзинад.рф/sitemap_index.xml'))
    for url in sorted(sitemap):
        # Article URLs contain a /YYYY/ path component.
        if re.search(r'/20\d{2}/', url) is None:
            continue
        html = crawler.fetch_content(url)
        title = extract('<h1 class="entry-title">', '</h1>', html) or ''
        text = extract('<div class="td-post-content">', '<footer>',
                       html) or ''
        text = text.split('<div class = "evc-social-likes"')[0]
        pubdate_match = re.search(
            r'<meta property="article:published_time" content="([^"]+)"',
            html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        paras = clean_paragraphs('%s<p/>%s' % (title, text))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
def _crawl_eestikirik_ee(crawler, out):
    """Crawl eestikirik.ee (Estonian church newspaper) articles."""
    for url in sorted(_find_urls_eestikirik_ee(crawler)):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = extract('<h1 class="entry_title">', '</h1>', html) or ''
        entry = extract('<div class="entry">',
                        '<div style="min-height:33px;"', html) or ''
        # Bug fix: the inner extract() may return None, and re.search()
        # raises TypeError on a None haystack.
        date_blob = extract('<div id="content">', '</small>', html) or ''
        pubdate = re.search(r'(\d{1,2})\.(\d{1,2})\.(20\d{2})', date_blob)
        if pubdate is not None:
            # Reorder d.m.yyyy into zero-padded ISO yyyy-mm-dd.
            pubdate = '%04d-%02d-%02d' % (int(pubdate.group(3)),
                                          int(pubdate.group(2)),
                                          int(pubdate.group(1)))
        paras = clean_paragraphs('%s<br/>%s' % (title, entry))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
def _crawl_val_levante_emv_com(crawler, out):
    """Crawl the Valencian edition of levante-emv.com, linking Spanish pages."""
    urls = set()
    for url in crawler.fetch_sitemap('http://val.levante-emv.com/sitemap.xml'):
        url = url.replace('//www.levante-emv.com', '//val.levante-emv.com')
        # Articles carry a /YYYY/MM/DD/ date path.
        if re.search(r'/\d{4}/\d{2}/\d{2}/', url) is not None:
            urls.add(url)
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        es_url = url.replace('//val.levante-emv.com', '//www.levante-emv.com')
        html = doc.content.decode('utf-8')
        pubdate_match = re.search(
            r'<meta name="cXenseParse:recs:publishtime" content="([^"]+)"',
            html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        # NOTE(review): the title is extracted starting at the articleBody
        # marker and ending at </h1>; this looks odd — confirm against the
        # site's actual markup.
        title = extract('<span itemprop="articleBody">', '</h1>', html)
        subtitle = extract('<h2 itemprop="description">', '</h2>', html)
        content = extract('<span itemprop="articleBody">',
                          '</apertium-notrans>', html)
        pieces = ['<p>%s</p>' % p for p in (title, subtitle, content) if p]
        text = '\n'.join(clean_paragraphs(''.join(pieces)))
        for sep in ['Compartir en Twitter', 'HEMEROTECA\n', '\nPublicitat\n']:
            text = text.split(sep)[0].strip()
        if not text:
            continue
        # Skip subscriber-only teaser pages.
        if any(b in text for b in [
                'inicia sessió si eres subscriptor',
                'Si eres subscriptor inicia sessió',
                'Para continuar leyendo... suscríbete']):
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Translation.es: %s\n' % es_url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write(text + '\n')
def _crawl_than_lwin_times(crawler, out):
    """Crawl thanlwintimes.com (WordPress news site)."""
    urls = find_wordpress_urls(crawler, 'http://thanlwintimes.com/')
    for url in sorted(urls):
        if not url.endswith('/'):
            continue
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        pubdate_match = re.search(
            r'<time class="entry-date updated td-module-date" '
            r'datetime="([^"]+)"', html)
        pubdate = pubdate_match.group(1) if pubdate_match else ''
        title = (extract('<title>', '</title>', html) or '').split('|')[0]
        body = extract('<div class="td-post-content">',
                       "<div class='sfsi_Sicons'", html) or ''
        # Drop the social-media call to action and everything after it.
        body = body.split('Please follow and like us')[0]
        paragraphs = clean_paragraphs('%s<br/>%s' % (title, body))
        if len(paragraphs) > 0:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paragraphs) + '\n')
def _crawl_svaboda_org(crawler, out):
    """Crawl svaboda.org (RFE/RL Belarusian service) articles."""
    sitemap = crawler.fetch_sitemap('https://www.svaboda.org/sitemap.xml')
    for url in sorted(sitemap):
        # Skip the front page and /z/ index pages.
        if (url == 'https://www.svaboda.org/'
                or url.startswith('https://www.svaboda.org/z/')):
            continue
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = extract('<title>', '</title>', html) or ''
        pubdate_match = re.search(
            r'<div class="published">\s*<span class="date"\s*>'
            r'\s*<time datetime="([^"]+)"', html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        body = extract('<div class="body-container">', '<div id="comments"',
                       html) or ''
        paras = clean_paragraphs('%s<p/>%s' % (title, body))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')