import re

try:
    from urllib.parse import urljoin   # Python 3
except ImportError:
    from urlparse import urljoin       # Python 2

# cleantext() is provided elsewhere in the surrounding project.


def crawl_voice_of_nigeria(crawler, out, urlprefix):
    assert urlprefix.startswith('/'), urlprefix
    assert urlprefix.endswith('/'), urlprefix
    site = urljoin('http://von.gov.ng/', urlprefix)
    for url in sorted(find_wordpress_urls(crawler, site)):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = re.search(r'<h1[^>]*>(.+?)</h1>', html, re.DOTALL).group(1)
        pubdate = re.search(
            r'<meta itemprop="dateModified" content="(.+?)"', html)
        if pubdate is None:  # only a few pages with little content
            continue
        pubdate = cleantext(pubdate.group(1))
        # Article body: everything from the first <p> up to the footer.
        content = re.split('<p[^>]*>', html, 1)[1].split('<footer', 1)[0]
        content = content.replace('\n', ' ').replace('</p>', '\n')
        paras = [title] + content.splitlines()
        # Materialize a list (not a lazy filter object) so the emptiness
        # check below also works on Python 3.
        paras = [p for p in (cleantext(p) for p in paras) if p]
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')
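
# A minimal sketch of the fetch interface assumed by the crawlers in this
# file: crawler.fetch(url) returns an object with .status (int) and
# .content (raw bytes).  The surrounding project supplies its own crawler
# object; the SimpleCrawler class below is a hypothetical stand-in for
# illustration only, built on urllib.
import collections

try:
    from urllib.request import Request, urlopen                # Python 3
    from urllib.error import HTTPError, URLError
except ImportError:
    from urllib2 import Request, urlopen, HTTPError, URLError  # Python 2

FetchResult = collections.namedtuple('FetchResult', 'status content')


class SimpleCrawler(object):
    """Hypothetical stand-in for the project's crawler object."""

    def __init__(self, user_agent='example-crawler-sketch/1.0'):
        self.user_agent = user_agent

    def fetch(self, url):
        # Return the HTTP status and raw response bytes; network errors
        # are reported as a 404 so callers can simply skip the document.
        try:
            req = Request(url, headers={'User-Agent': self.user_agent})
            resp = urlopen(req, timeout=30)
            return FetchResult(resp.getcode(), resp.read())
        except HTTPError as err:
            return FetchResult(err.code, b'')
        except URLError:
            return FetchResult(404, b'')

    def fetch_content(self, url):
        # Convenience wrapper used by one find_wordpress_urls variant below.
        return self.fetch(url).content.decode('utf-8')

# Example wiring (urlprefix is a placeholder section path):
#   crawl_voice_of_nigeria(SimpleCrawler(), out_file, '/some-section/')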

def find_wordpress_urls(crawler, site):
    urls = set()
    mainpage = crawler.fetch(site).content.decode('utf-8')
    for category in re.findall(r'/(category/[^/"]+/)">', mainpage):
        caturl = urljoin(site, category)
        catdoc = crawler.fetch(caturl)
        assert catdoc.status == 200, (catdoc.status, caturl)
        # Highest pagination number linked from the category page.
        pages = [int(n) for n in re.findall(r'/page/(\d+)/',
                                            catdoc.content.decode('utf-8'))]
        for page in range(1, 1 + max([0] + pages)):
            pgurl = urljoin(caturl, 'page/%d/' % page) if page > 1 else caturl
            pgdoc = crawler.fetch(pgurl)
            assert pgdoc.status == 200, (pgdoc.status, pgurl)
            pgcontent = pgdoc.content.decode('utf-8')
            for url in re.findall(r'"(%s[^/"]+/)"' % site, pgcontent):
                if not url.endswith('/feed/'):
                    urls.add(url)
    return urls

# Variant of find_wordpress_urls that tolerates non-200 pages and strips
# URL fragments and HTML entities before filtering.
def find_wordpress_urls(crawler, site):
    urls = set()
    mainpage = crawler.fetch_content(site)
    for category in re.findall(r'/(category/[^/"]+/)">', mainpage):
        caturl = urljoin(site, category)
        catdoc = crawler.fetch(caturl)
        assert catdoc.status == 200, (catdoc.status, caturl)
        # Highest pagination number linked from the category page.
        pages = [int(n) for n in re.findall(r'/page/(\d+)/',
                                            catdoc.content.decode('utf-8'))]
        for page in range(1, 1 + max([0] + pages)):
            pgurl = urljoin(caturl, 'page/%d/' % page) if page > 1 else caturl
            pgdoc = crawler.fetch(pgurl)
            if pgdoc.status != 200:
                print('Error %3d: %s' % (pgdoc.status, pgurl))
                continue
            pgcontent = pgdoc.content.decode('utf-8')
            for url in re.findall(r'"(%s[^"]+)"' % site, pgcontent):
                url = replace_html_entities(url.split('#')[0])
                if url.find('/category/') < 0 and not url.endswith('/feed/'):
                    urls.add(url)
    return urls
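
# replace_html_entities() is defined elsewhere in the project and its exact
# behaviour is not shown here.  As an assumption, a minimal stand-in that
# unescapes standard HTML entities (e.g. '&amp;' inside extracted hrefs)
# could look like the hypothetical helper below:
try:
    from html import unescape as _unescape      # Python 3.4+
except ImportError:
    from HTMLParser import HTMLParser           # Python 2
    _unescape = HTMLParser().unescape


def replace_html_entities_sketch(text):
    # Hypothetical stand-in, not the project's actual implementation.
    return _unescape(text)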

def crawl_voice_of_america(self, out, host, ignore_ascii=False):
    site = 'https://%s' % host
    sitemap = self.fetch_sitemap(urljoin(site, 'sitemap.xml'))
    for url in sorted(sitemap.keys()):
        doc = self.fetch(url)
        if doc.status != 200:
            continue
        try:
            html = doc.content.decode('utf-8')
        except UnicodeDecodeError:
            continue
        title = re.search(r'<title>(.+?)</title>', html)
        title = title.group(1) if title else ''
        pubdate = re.search(
            r'<div class="published">\s*<span class="date"\s*>\s*'
            r'<time datetime="(.+?)"', html)
        pubdate = cleantext(pubdate.group(1)) if pubdate else ''
        if pubdate.startswith('1900'):
            pubdate = ''
        description = re.search(
            r'<meta name="description" content="(.+?)"', html)
        description = description.group(1) if description else ''
        if description == title:
            description = ''
        paragraphs = [title, description]
        if html.find('<div class="intro content-offset">') > 0:
            intro = html.split('<div class="intro content-offset">', 1)[1]
            intro = intro.split('</div')[0]
            intro = intro.replace('</p>', '\n').replace('</P>', '\n')
            paragraphs.extend(intro.splitlines())
        if html.find('<div class="wsw">') > 0:
            content = html.split('<div class="wsw">', 1)[1]
            content = content.split('<div')[0]
            content = content.replace('</p>', '\n').replace('</P>', '\n')
            paragraphs.extend(content.splitlines())
        paragraphs = filter(None, [cleantext(p) for p in paragraphs])
        paragraphs = [p for p in paragraphs if not p.startswith('VOA')]
        if ignore_ascii:
            paragraphs = [
                p for p in paragraphs
                if not (ord(p[0]) >= 0x30 and ord(p[0]) <= 0xff)
            ]
        if len(paragraphs) > 0:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paragraphs) + '\n')
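
# crawl_voice_of_america() relies on self.fetch_sitemap(), which is defined
# elsewhere in the project; the only property used above is that it returns
# a dict keyed by page URL.  A minimal sketch under that assumption (it does
# not handle <sitemapindex> files that point at further sitemaps):
def fetch_sitemap_sketch(crawler, sitemap_url):
    # Hypothetical helper: map each <loc> URL to its <lastmod>, or None.
    doc = crawler.fetch(sitemap_url)
    if doc.status != 200:
        return {}
    xml = doc.content.decode('utf-8')
    result = {}
    for entry in re.findall(r'<url>(.*?)</url>', xml, re.DOTALL):
        loc = re.search(r'<loc>\s*(.*?)\s*</loc>', entry)
        lastmod = re.search(r'<lastmod>\s*(.*?)\s*</lastmod>', entry)
        if loc:
            result[loc.group(1)] = lastmod.group(1) if lastmod else None
    return result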

def urlallowed(url):
    if CONFIG['skip-robots-txt']:
        return True
    protocol, domain = urlparse.urlparse(url)[:2]
    for bd in ROBOTS_TXT_BLACKLIST_DOMAINS:
        if re.match(bd, domain):
            return True
    for d in ['sourceforge', 'berlios', 'github.com']:
        if d in domain:
            return True
    if protocol == 'ftp':
        return True
    baseurl = '%s://%s' % (protocol, domain)
    robotsurl = urlparse.urljoin(baseurl, 'robots.txt')
    if rpcache.has_key(baseurl):
        rp = rpcache[baseurl]
    else:
        from socket import setdefaulttimeout, getdefaulttimeout
        timeout = getdefaulttimeout()
        setdefaulttimeout(5)
        rp = robotparser.RobotFileParser()
        rp.set_url(robotsurl)
        try:
            rp.read()
            rpcache[baseurl] = rp
        except:
            rp = None
        setdefaulttimeout(timeout)
    return rp.can_fetch(CONFIG['user-agent'], url) if rp else False

# Variant of urlallowed() that treats an unreadable robots.txt as allowed.
def urlallowed(url):
    if CONFIG['skip-robots-txt']:
        return True
    protocol, domain = urlparse.urlparse(url)[:2]
    for bd in ROBOTS_TXT_BLACKLIST_DOMAINS:
        if re.match(bd, domain):
            return True
    for d in ['sourceforge', 'berlios', 'github.com']:
        if d in domain:
            return True
    if protocol == 'ftp':
        return True
    baseurl = '%s://%s' % (protocol, domain)
    robotsurl = urlparse.urljoin(baseurl, 'robots.txt')
    if baseurl in rpcache:
        rp = rpcache[baseurl]
    else:
        from socket import setdefaulttimeout, getdefaulttimeout
        timeout = getdefaulttimeout()
        setdefaulttimeout(5)
        rp = robotparser.RobotFileParser()
        rp.set_url(robotsurl)
        try:
            rp.read()
            rpcache[baseurl] = rp
        except:
            rp = None
        setdefaulttimeout(timeout)
    return rp.can_fetch(CONFIG['user-agent'], url) if rp else True
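
# The urlallowed() variants above read module-level globals defined
# elsewhere: CONFIG, rpcache and ROBOTS_TXT_BLACKLIST_DOMAINS.  They also
# use the Python 2 module names urlparse and robotparser (urllib.parse and
# urllib.robotparser on Python 3).  Hypothetical example values showing the
# expected shapes, plus a sample call:
import urlparse       # Python 2 module name, matching the code above
import robotparser    # Python 2 module name, matching the code above

CONFIG = {
    'skip-robots-txt': False,          # honour robots.txt by default
    'user-agent': 'example-bot/1.0',   # agent string passed to can_fetch()
}
rpcache = {}                           # baseurl -> cached RobotFileParser
ROBOTS_TXT_BLACKLIST_DOMAINS = [       # regexes matched against the domain
    r'.*\.example\.org',
]

if __name__ == '__main__':
    # Fetches http://example.com/robots.txt once and caches the parsed
    # result in rpcache for later calls.
    print(urlallowed('http://example.com/index.html'))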