Example #1
def crawl_voice_of_nigeria(crawler, out, urlprefix):
    assert urlprefix.startswith('/'), urlprefix
    assert urlprefix.endswith('/'), urlprefix
    site = urljoin('http://von.gov.ng/', urlprefix)
    for url in sorted(find_wordpress_urls(crawler, site)):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = re.search(r'<h1[^>]*>(.+?)</h1>', html, re.DOTALL).group(1)
        pubdate = re.search(r'<meta itemprop="dateModified" content="(.+?)"',
                            html)
        if pubdate is None:  # only a few pages with little content
            continue
        pubdate = cleantext(pubdate.group(1))
        content = re.split('<p[^>]*>', html, 1)[1].split('<footer', 1)[0]
        content = content.replace('\n', ' ').replace('</p>', '\n')
        paras = [title] + content.splitlines()
        paras = list(filter(None, [cleantext(p) for p in paras]))
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')
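
This example, like the ones that follow, is a snippet from a larger crawler: it assumes `re` is imported, `urljoin` is available (`from urllib.parse import urljoin` on Python 3), and it relies on project helpers that are not shown here, namely a crawler whose `fetch()` returns a response with `.status` and `.content` (raw bytes) and a `cleantext()` that normalizes extracted text. A minimal stand-in for those helpers, purely illustrative and not the original project's API, might look like this:

# Hypothetical stand-ins for the helpers assumed by these examples;
# the real project may implement them differently.
import re
import urllib.error
import urllib.request
from collections import namedtuple

FetchResult = namedtuple('FetchResult', 'status content')

class SimpleCrawler:
    def fetch(self, url):
        # Return the HTTP status plus the raw body bytes, mirroring the
        # doc.status / doc.content attributes used above.
        try:
            with urllib.request.urlopen(url, timeout=10) as resp:
                return FetchResult(resp.status, resp.read())
        except urllib.error.HTTPError as err:
            return FetchResult(err.code, b'')

def cleantext(text):
    # Strip any leftover tags and collapse runs of whitespace.
    text = re.sub(r'<[^>]+>', ' ', text)
    return ' '.join(text.split())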
Example #2
def find_wordpress_urls(crawler, site):
    urls = set()
    mainpage = crawler.fetch(site).content.decode('utf-8')
    for category in re.findall(r'/(category/[^/"]+/)">', mainpage):
        caturl = urljoin(site, category)
        catdoc = crawler.fetch(caturl)
        assert catdoc.status == 200, (catdoc.status, caturl)
        catcontent = catdoc.content.decode('utf-8')
        pages = [int(n) for n in re.findall(r'/page/(\d+)/', catcontent)]
        # Always scan at least the first page of the category.
        for page in range(1, 1 + max([1] + pages)):
            pgurl = urljoin(caturl, 'page/%d/' % page) if page > 1 else caturl
            pgdoc = crawler.fetch(pgurl)
            assert pgdoc.status == 200, (pgdoc.status, pgurl)
            pgcontent = pgdoc.content.decode('utf-8')
            for url in re.findall(r'"(%s[^/"]+/)"' % re.escape(site), pgcontent):
                if not url.endswith('/feed/'):
                    urls.add(url)
    return urls
Example #3
def find_wordpress_urls(crawler, site):
    urls = set()
    mainpage = crawler.fetch_content(site)
    for category in re.findall(r'/(category/[^/"]+/)">', mainpage):
        caturl = urljoin(site, category)
        catdoc = crawler.fetch(caturl)
        assert catdoc.status == 200, (catdoc.status, caturl)
        catcontent = catdoc.content.decode('utf-8')
        pages = [int(n) for n in re.findall(r'/page/(\d+)/', catcontent)]
        # Always scan at least the first page of the category.
        for page in range(1, 1 + max([1] + pages)):
            pgurl = urljoin(caturl, 'page/%d/' % page) if page > 1 else caturl
            pgdoc = crawler.fetch(pgurl)
            if pgdoc.status != 200:
                print('Error %3d:      %s' % (pgdoc.status, pgurl))
                continue
            pgcontent = pgdoc.content.decode('utf-8')
            for url in re.findall(r'"(%s[^"]+)"' % re.escape(site), pgcontent):
                url = replace_html_entities(url.split('#')[0])
                if '/category/' not in url and not url.endswith('/feed/'):
                    urls.add(url)
    return urls
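
Example #3 is a more tolerant variant of Example #2: it logs and skips listing pages that do not return 200, strips URL fragments, decodes HTML entities through a `replace_html_entities()` helper, and drops category links. That helper is not shown; a plausible stand-in (an assumption, not the original implementation) is simply `html.unescape`:

import html

def replace_html_entities(text):
    # Decode entities such as &amp; back to their literal characters.
    return html.unescape(text)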
Example #4
def crawl_voice_of_america(self, out, host, ignore_ascii=False):
    site = 'https://%s' % host
    sitemap = self.fetch_sitemap(urljoin(site, 'sitemap.xml'))
    for url in sorted(sitemap.keys()):
        doc = self.fetch(url)
        if doc.status != 200:
            continue
        try:
            html = doc.content.decode('utf-8')
        except UnicodeDecodeError:
            continue
        title = re.search(r'<title>(.+?)</title>', html)
        title = title.group(1) if title else ''
        pubdate = re.search(
            r'<div class="published">\s*<span class="date"\s*>\s*'
            r'<time datetime="(.+?)"', html)
        pubdate = cleantext(pubdate.group(1)) if pubdate else ''
        if pubdate.startswith('1900'):
            pubdate = ''
        description = re.search(
            r'<meta name="description" content="(.+?)"', html)
        description = description.group(1) if description else ''
        if description == title:
            description = ''
        paragraphs = [title, description]
        if html.find('<div class="intro content-offset">') > 0:
            intro = html.split('<div class="intro content-offset">', 1)[1]
            intro = intro.split('</div')[0]
            intro = intro.replace('</p>', '\n').replace('</P>', '\n')
            paragraphs.extend(intro.splitlines())
        if html.find('<div class="wsw">') > 0:
            content = html.split('<div class="wsw">', 1)[1]
            content = content.split('<div')[0]
            content = content.replace('</p>', '\n').replace('</P>', '\n')
            paragraphs.extend(content.splitlines())
        paragraphs = filter(None, [cleantext(p) for p in paragraphs])
        paragraphs = [p for p in paragraphs if not p.startswith('VOA')]
        if ignore_ascii:
            paragraphs = [
                p for p in paragraphs
                if not (ord(p[0]) >= 0x30 and ord(p[0]) <= 0xff)
            ]
        if len(paragraphs) > 0:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paragraphs) + '\n')
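
Example #4 also depends on a `fetch_sitemap()` method that is not shown; from the way it is used, it is expected to return a mapping keyed by page URL. A hedged sketch of such a helper, written as a plain function over the assumed `fetch()` interface rather than the original method, could be:

import re

def fetch_sitemap(crawler, sitemap_url):
    # Hypothetical sketch: map every <loc> entry in sitemap.xml to None;
    # the original project may attach richer metadata as the values.
    doc = crawler.fetch(sitemap_url)
    if doc.status != 200:
        return {}
    xml = doc.content.decode('utf-8')
    return {loc: None for loc in re.findall(r'<loc>(.+?)</loc>', xml)}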
Example #5
def urlallowed(url):
    if CONFIG['skip-robots-txt']:
        return True

    protocol, domain = urlparse.urlparse(url)[:2]

    for bd in ROBOTS_TXT_BLACKLIST_DOMAINS:
        if re.match(bd, domain):
            return True

    for d in ['sourceforge', 'berlios', 'github.com']:
        if d in domain:
            return True

    if protocol == 'ftp':
        return True

    baseurl = '%s://%s' % (protocol, domain)
    robotsurl = urlparse.urljoin(baseurl, 'robots.txt')

    if baseurl in rpcache:
        rp = rpcache[baseurl]
    else:
        from socket import setdefaulttimeout, getdefaulttimeout

        timeout = getdefaulttimeout()
        setdefaulttimeout(5)

        rp = robotparser.RobotFileParser()
        rp.set_url(robotsurl)
        try:
            rp.read()
            rpcache[baseurl] = rp
        except Exception:
            rp = None

        setdefaulttimeout(timeout)

    return rp.can_fetch(CONFIG['user-agent'], url) if rp else False
Example #6
def urlallowed(url):
    if CONFIG['skip-robots-txt']:
        return True

    protocol, domain = urlparse.urlparse(url)[:2]

    for bd in ROBOTS_TXT_BLACKLIST_DOMAINS:
        if re.match(bd, domain):
            return True

    for d in ['sourceforge', 'berlios', 'github.com']:
        if d in domain:
            return True

    if protocol == 'ftp':
        return True

    baseurl = '%s://%s' % (protocol, domain)
    robotsurl = urlparse.urljoin(baseurl, 'robots.txt')

    if baseurl in rpcache:
        rp = rpcache[baseurl]
    else:
        from socket import setdefaulttimeout, getdefaulttimeout

        timeout = getdefaulttimeout()
        setdefaulttimeout(5)

        rp = robotparser.RobotFileParser()
        rp.set_url(robotsurl)
        try:
            rp.read()
            rpcache[baseurl] = rp
        except Exception:
            rp = None

        setdefaulttimeout(timeout)

    return rp.can_fetch(CONFIG['user-agent'], url) if rp else True
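
Examples #5 and #6 target Python 2 (`urlparse`, `robotparser`) and differ only in the dictionary-membership test and in the fallback when robots.txt cannot be read: #5 denies the URL, #6 allows it. A rough Python 3 sketch of the #6 behaviour, with `CONFIG` and `rpcache` filled in as assumptions and the domain whitelist checks omitted, might look like:

import socket
import urllib.parse
import urllib.robotparser

rpcache = {}
CONFIG = {'skip-robots-txt': False, 'user-agent': 'example-crawler/1.0'}

def urlallowed(url):
    if CONFIG['skip-robots-txt']:
        return True
    protocol, domain = urllib.parse.urlparse(url)[:2]
    if protocol == 'ftp':
        return True
    baseurl = '%s://%s' % (protocol, domain)
    rp = rpcache.get(baseurl)
    if rp is None:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(urllib.parse.urljoin(baseurl, 'robots.txt'))
        old_timeout = socket.getdefaulttimeout()
        socket.setdefaulttimeout(5)
        try:
            rp.read()
            rpcache[baseurl] = rp
        except Exception:
            return True  # like Example #6: allow when robots.txt is unreadable
        finally:
            socket.setdefaulttimeout(old_timeout)
    return rp.can_fetch(CONFIG['user-agent'], url)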