def crawl_gsw_seislerblog(crawler):
    # Collect article URLs from the paginated index of the Seislerblog.
    urls = set()
    for i in range(1, 16):
        indexurl = ('http://www.freiburger-nachrichten.ch/blogs/seislerblog'
                    '?page=%d' % i)
        html = crawler.fetch(indexurl).content.decode('utf-8')
        for url in re.findall(r'<a href="(/blogs/seislerblog/.+?)[\s"]', html):
            urls.add(urljoin(indexurl, url))
    # Swiss German as written in the Canton of Fribourg (gsw-u-sd-chfr).
    out = crawler.get_output('gsw-u-sd-chfr')
    for url in sorted(urls):
        out.write('# Location: %s\n' % url)
        out.write('# Genre: Blog\n')
        text = crawler.fetch(url).content.decode('utf-8')
        pubdate = re.search(
            r'<span class="date-created">([0-9]{1,2})\.([0-9]{2})\.'
            r'(20[0-9]{2})</span>', text)
        if pubdate is not None:
            # Convert DD.MM.YYYY to ISO 8601 (YYYY-MM-DD).
            day, month, year = pubdate.groups()
            pubdate = '%04d-%02d-%02d' % (int(year), int(month), int(day))
            out.write('# Publication-Date: %s\n' % pubdate)
        # Keep only the article body between the headline and the next section.
        text = text.split('<h1>', 1)[-1].split('<section')[0]
        text = text.replace('\n', ' ')
        for tag in ('</p>', '</h1>', '</div>'):
            text = text.replace(tag, '\n')
        for p in [' '.join(striptags(t).strip().split())
                  for t in text.splitlines()]:
            if p and p != 'Kommentare':
                out.write(p + '\n')
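# These crawlers rely on `striptags` from the project's shared utility module.
# As used here it only needs to drop HTML tags; the following is a minimal
# sketch of that assumption, not the project's actual helper.
import re

def striptags(html):
    # Replace each <...> tag with a space so adjacent words do not merge;
    # callers collapse the remaining whitespace themselves.
    return re.sub(r'<[^>]*>', ' ', html)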
def crawl_gsw_derbund(crawler):
    urls = set()
    # Page through the dossier's AJAX endpoint until it returns no more items.
    for i in range(1, 200):
        url = ('https://www.derbund.ch/ajax/tags.html?'
               'action=moreDossierStories&section_id=11127&page=%d'
               '&dossier_id=3069' % i)
        items = json.loads(crawler.fetch(url).content)['items']
        for path in re.findall(r'<a href="(.+?)"', ''.join(items)):
            if not path.startswith('/stichwort/autor/'):
                urls.add(urljoin('https://www.derbund.ch/', path))
        if len(items) == 0:
            break
    # Swiss German as written in the Canton of Bern (gsw-u-sd-chbe).
    out = crawler.get_output('gsw-u-sd-chbe')
    for url in sorted(urls):
        text = crawler.fetch(url).content.decode('utf-8')
        pubdate = re.search(
            r'Erstellt: ([0-9]{1,2})\.([0-9]{2})\.([0-9]{4})', text)
        if pubdate is not None:
            day, month, year = pubdate.groups()
            pubdate = '%04d-%02d-%02d' % (int(year), int(month), int(day))
        out.write('# Location: %s\n' % url)
        out.write('# Genre: Blog\n')
        if pubdate is not None:
            out.write('# Publication-Date: %s\n' % pubdate)
        # Cut the page down to the article body; the marker strings below are
        # matched literally against the page source.
        text = text.split('<div id="mainContent">')[1]
        text = text.split('<span class"idcode"')[0].split('(Der Bund)')[0]
        text = text.replace('***', ' ')
        if text.find('var badwordserch = 1;') >= 0:
            text = text.split('var badwordserch = 1;', 1)[1]
        paras = [' '.join(striptags(p).split()) for p in text.split('</p>')]
        for p in paras:
            if p:
                out.write(p + '\n')
def _find_tirreno_urls(crawler, category):
    site = 'http://iltirreno.gelocal.it/'
    urls = set()
    caturl = site + category
    catpage = crawler.fetch_content(caturl)
    num_pages = re.search(
        r'Pagina <span class="active">\d+</span> di (\d+)', catpage)
    baseurl = re.search(r'<a title="Vai a pagina 1" href="([^"]+)"', catpage)
    if num_pages is None or baseurl is None:
        return urls
    num_pages = int(num_pages.group(1))
    baseurl = urljoin(site, baseurl.group(1))
    for p in range(1, num_pages + 1):
        url = '%s?page=%d' % (baseurl, p) if p > 1 else baseurl
        content = crawler.fetch_content(url)
        for u in re.findall(r'<h1><a href="([^"]+)">', content):
            u = urljoin(site, replace_html_entities(u.strip()))
            if not u.startswith('http://old.iltirreno.gelocal.it/'):
                urls.add(u)
    return urls
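# `_find_tirreno_urls` only discovers article URLs; a separate crawl function
# is expected to union the results over several categories before fetching
# the articles. A hypothetical caller might look like the sketch below; the
# category names are illustrative placeholders, not taken from the source.
def _example_collect_tirreno_urls(crawler):
    urls = set()
    for category in ('regione', 'italia-mondo'):  # placeholder categories
        urls.update(_find_tirreno_urls(crawler, category))
    return sorted(urls)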
def _find_urls_utusan_borneo_berita_iban(crawler):
    urls = set()
    main = crawler.fetch_content('http://www.utusanborneo.com.my/iban')
    num_pages = max(map(int, re.findall(r'\?page=(\d+)', main)))
    for p in range(0, num_pages):
        index_url = 'http://www.utusanborneo.com.my/iban'
        if p > 0:
            index_url = index_url + '?page=%d' % p
        for url in re.findall(r'href="(/\d{4}/\d{2}/\d{2}/[^"]+)"',
                              crawler.fetch_content(index_url)):
            urls.add(urljoin(index_url, url))
    return urls
def crawl_newsbook_mt(crawler, out):
    urls = set()
    for section in ('internazzjonali', 'muzika', 'madwar-il-hajja',
                    'teknologijja', 'vatikan', 'sports', 'kummerc'):
        section_url = 'http://www.newsbook.com.mt/artikli/%s/' % section
        html = crawler.fetch(section_url).content.decode('utf-8')
        # The section index links to its own numbered pages; take the highest
        # page number as the total.
        links = re.findall(r'/artikli/%s/(\d+)/' % section, html)
        num_toc_pages = max([int(x) for x in links])
        for i in range(1, num_toc_pages + 1):
            toc_url = section_url
            if i > 1:
                toc_url = toc_url + '%d/' % i
            html = crawler.fetch(toc_url).content.decode('utf-8')
            for u in re.findall(r'href="(/artikli/\d{4}/.+?)"', html):
                url = urljoin(toc_url, u)
                if url.find('/test') < 0:
                    urls.add(url)
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = re.search(r'<meta content="([^"]+?)" name="title"', html)
        if title is not None:
            title = cleantext(title.group(1))
        pubdate = re.search(
            r'<meta content="([^"]+?)" itemprop="datePublished"', html)
        if pubdate is not None:
            pubdate = pubdate.group(1).strip().replace(' ', 'T') + 'Z'
        content = html.split('<p>', 1)[1].split('<div', 1)[0]
        content = content.replace('\n', ' ').replace('</p>', '\n')
        paras = [fixquotes(cleantext(p))
                 for p in [title] + content.splitlines()]
        paras = [p for p in paras if p]
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')
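# `cleantext` and `fixquotes` are further helpers from the shared utility
# module. From their call sites above, `cleantext` appears to strip markup,
# unescape entities and collapse whitespace, and `fixquotes` appears to
# normalize quotation marks. Rough sketches under those assumptions (the
# project's own versions may handle more cases):
import html as htmllib  # aliased: the crawlers use `html` as a local variable
import re

def cleantext(text):
    # Drop tags, unescape entities such as &amp;, and collapse whitespace.
    text = re.sub(r'<[^>]*>', ' ', text)
    return ' '.join(htmllib.unescape(text).split())

def fixquotes(text):
    # Turn "straight" double quotes around a phrase into typographic quotes.
    return re.sub(r'"([^"]*)"', '“\\1”', text)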
def _crawl_yaikrb_ru(crawler, out):
    urls = set()
    # The site has an incomplete sitemap, so we also look at the archive pages.
    sitemap = crawler.fetch_sitemap('http://yaikrb.ru/sitemap.xml')
    urls.update(sitemap)
    main = crawler.fetch_content('http://yaikrb.ru/')
    archives = set([str(x) for x in range(1, 150)])
    archives.update(re.findall(r'/xf/num/([^/]+)/', main))
    for a in sorted(archives):
        doc = crawler.fetch(urlencode('http://yaikrb.ru/xf/num/%s/' % a))
        if doc.status != 200:
            continue
        for href in re.findall(r'<div class="n_more"><a href="([^"]+)"',
                               doc.content.decode('utf-8')):
            urls.add(urljoin('http://yaikrb.ru/', href,
                             allow_fragments=False))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = re.search(r'<meta property="og:title" content="([^"]+)"', html)
        title = title.group(1) if title else ''
        text = extract('<div class="n_text">', '<div class="n_oth">', html)
        if not text:
            continue
        paras = clean_paragraphs('<h1>' + title + '</h1>' + text)
        if not paras:
            continue
        pubdate = re.search(
            r'<small>(\d{1,2})\.(\d{1,2})\.(20\d{2})\s*</small></h1>', html)
        if pubdate is not None:
            pubdate = '%s-%s-%s' % (pubdate.group(3), pubdate.group(2),
                                    pubdate.group(1))
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')
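# `extract` and `clean_paragraphs` (also used by `_crawl_rtl_lu` below) come
# from the same utility module. Judging by the call sites, `extract(start,
# end, text)` returns the substring between two markers (or None if the start
# marker is missing), and `clean_paragraphs` turns an HTML fragment into a
# list of cleaned text paragraphs. Minimal sketches under those assumptions:
import re

def extract(start, end, text):
    # Substring between `start` and `end`; None if `start` is not found.
    startpos = text.find(start)
    if startpos < 0:
        return None
    startpos += len(start)
    endpos = text.find(end, startpos)
    return text[startpos:endpos] if endpos >= 0 else text[startpos:]

def clean_paragraphs(fragment):
    # Treat closing block-level tags (and the '<p/>' separator used in
    # _crawl_rtl_lu) as paragraph breaks, strip the remaining markup, and
    # drop empty lines.
    fragment = re.sub(r'</(?:p|h\d|div|li)>|<p/>', '\n', fragment)
    fragment = re.sub(r'<[^>]*>', ' ', fragment)
    paras = [' '.join(line.split()) for line in fragment.splitlines()]
    return [p for p in paras if p]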
def _crawl_rtl_lu(crawler, out):
    urls = set()
    homepage = crawler.fetch_content('http://www.rtl.lu/')
    cats = extract('<!-- MAIN NAVIGATION -->', '</header>', homepage)
    for cat in re.findall(r'href="(https?://www\.rtl\.lu/[^"]+?)">', cats):
        caturl = cat + 'archiv/'
        if cat.find('/sport/') > 0:
            caturl = caturl + 'all/'
        doc = crawler.fetch(caturl)
        if doc.status != 200:
            continue
        content = doc.content.decode('utf-8')
        num_pages = re.search(r'archiv\?p=(\d+)" class="last">»', content)
        num_pages = int(num_pages.group(1)) if num_pages else 0
        for p in range(1, num_pages + 1):
            page = crawler.fetch_content(caturl[:-1] + '?p=%d' % p)
            html = extract('<div class="teaser archive-header">',
                           '<div class="pager">', page)
            if not html:
                continue
            for url in re.findall(r'href="([^"]+?)"', html):
                urls.add(urlencode(urljoin('http://www.rtl.lu/', url)))
    for url in sorted(urls):
        if url in BLACKLIST:
            continue
        doc = crawler.fetch_content(url)
        header = extract('<header>', '</header>', doc) or ''
        if header:
            header = header.replace('</span>', ' ')
        pubdate = re.search(
            r'(\d{1,2})\.(\d{1,2})\.(20\d{2}), (\d\d):(\d\d):(\d\d)</li>', doc)
        if pubdate:
            pd = [int(x) for x in pubdate.groups()]
            pubdate = '%04d-%02d-%02dT%02d:%02d:%02d+02:00' % (
                pd[2], pd[1], pd[0], pd[3], pd[4], pd[5])
        if doc.find('<section class="mainbar-right omega body">') > 0:
            start_tag = '<section class="mainbar-right omega body">'
        else:
            start_tag = '<p>'
        content = extract(start_tag, '<!-- BEGIN Comments -->', doc) or ''
        content = re.sub(r'<script.+?</script>', '', content, flags=re.DOTALL)
        content = re.sub(r'<form.+?</form>', '', content, flags=re.DOTALL)
        content = content.split('<footer')[0]
        content = content.split('<div class="pager"')[0]
        paras = clean_paragraphs(header + '<p/>' + content)
        paras = [p for p in paras
                 if (p.find('Vous souhaitez faire') < 0 and
                     p != 'äre Commentaire' and
                     not p.startswith('####'))]
        text = '\n'.join(paras)
        # Filter out some articles in French or German.
        if (text.find(' est ') >= 0 or text.find(' ist ') >= 0 or
                text.find(' Ist ') >= 0 or text.find(' dit ') >= 0 or
                text.find(' veut') >= 0):
            continue
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
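# All of these crawlers emit the same plain-text corpus format: '#'-prefixed
# metadata headers followed by one paragraph of text per line, for example
# (URL and date invented for illustration):
#
#   # Location: http://www.rtl.lu/news/national/a/example.html
#   # Genre: News
#   # Publication-Date: 2017-05-03T14:21:07+02:00
#   <first paragraph of the article>
#   <second paragraph of the article>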