Example #1
import csv
import re
import lxml.html
from link_crawler import link_crawler


class ScrapeCallback:
    def __init__(self):
        self.writer = csv.writer(open(r'D:\countries.csv',
                                      'w'))  # 'w' opens the file in write mode; the CSV is written to drive D
        self.fields = ('area', 'population', 'iso', 'country', 'capital',
                       'continent', 'tld', 'currency_code', 'currency_name',
                       'phone', 'postal_code_format', 'postal_code_regex',
                       'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(
                    tree.cssselect('table > tr#places_%s__row > td.w2p_fw' %
                                   field)[0].text_content())
            self.writer.writerow(row)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com',
                 '/places/default/(index|view)',
                 max_depth=1,
                 scrape_callback=ScrapeCallback())
Example #2
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
        # restrict maximum number of characters
        filename = '/'.join(segment[:255] for segment in filename.split('/'))
        return os.path.join(self.cache_dir, filename)

    def has_expired(self, timestamp):
        """
        Return whether this timestamp has expired
        """
        return datetime.utcnow() > timestamp + self.expires

    def clear(self):
        """
        Remove all the cached values
        """
        if os.path.exists(self.cache_dir):
            shutil.rmtree(self.cache_dir)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com/',
                 '/(index|view)',
                 cache=DiskCache())
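
These DiskCache fragments use self.cache_dir, self.expires and (in other snippets) self.compress without showing the constructor. A minimal sketch of what that __init__ presumably looks like; the parameter defaults are assumptions:

from datetime import timedelta


class DiskCache:
    def __init__(self, cache_dir='cache', expires=timedelta(days=30),
                 compress=True):
        # directory for cached pages, how long entries stay valid,
        # and whether the pickled data is zlib-compressed on disk
        self.cache_dir = cache_dir
        self.expires = expires
        self.compress = compress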
Example #3
        data = pickle.dumps((result, datetime.utcnow()))
        if self.compress:
            data = zlib.compress(data)

        with open(path, 'wb') as fp:
            fp.write(data)

    def __delitem__(self, url):
        # remove the cached result for this URL
        path = self.url_to_path(url)
        try:
            os.remove(path)
            os.removedirs(os.path.dirname(path))
        except OSError:
            pass

    def has_expired(self, timestamp):
        return datetime.utcnow() > timestamp + self.expires


    def clear(self):
        # remove all cached values
        if os.path.exists(self.cache_dir):
            shutil.rmtree(self.cache_dir)

if __name__ == '__main__':
    link_crawler('http://weibo.com/u/5249921593', '/5249921593', delay=0, num_retries=1, max_depth=2, user_agent='Baiduspider')
    #cache=DiskCache()
    #print cache['http://example.webscraping.com/view/Afghanistan-1']
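
The snippet above only shows how results are stored and deleted; the matching __getitem__ is cut off. A minimal sketch, consistent with the (result, timestamp) tuple pickled in __setitem__ and assuming the same os, pickle and zlib imports as the rest of the class:

    def __getitem__(self, url):
        # load the pickled (and optionally compressed) result from disk
        path = self.url_to_path(url)
        if os.path.exists(path):
            with open(path, 'rb') as fp:
                data = fp.read()
            if self.compress:
                data = zlib.decompress(data)
            result, timestamp = pickle.loads(data)
            if self.has_expired(timestamp):
                raise KeyError(url + ' has expired')
            return result
        else:
            raise KeyError(url + ' does not exist')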

Example #4
# -*- coding: utf-8 -*-

import csv
import re
import urlparse
import lxml.html
from link_crawler import link_crawler

FIELDS = ('area', 'population', 'iso', 'country', 'capital', 'continent',
          'tld', 'currency_code', 'currency_name', 'phone',
          'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')


def scrape_callback(url, html):
    if re.search('/view/', url):
        tree = lxml.html.fromstring(html)
        row = [
            tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(
                field))[0].text_content() for field in FIELDS
        ]
        print url, row


if __name__ == '__main__':
    link_crawler('http://127.0.0.1:8000/places/default/',
                 '/places/default/(index|view)',
                 scrape_callback=scrape_callback)
Example #5
        path = self.url_to_path(url)
        if os.path.exists(path):
            with open(path, 'rb') as fp:
                return pickle.loads(fp.read())
        else:
            # URL has not yet been cached
            raise KeyError(url + ' does not exist')

    def url_to_path(self, url):
        """Create file system path for this URL
        """
        components = urlparse.urlsplit(url)
        # when empty path set to /index.html
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
        # restrict maximum number of characters
        filename = '/'.join(segment[:255] for segment in filename.split('/'))
        return os.path.join(self.cache_dir, filename)


if __name__ == '__main__':
    link_crawler('http://127.0.0.1:8000/places/',
                 '/places/default/(index|view)/',
                 cache=DiskCache())
Example #6
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital',
                       'continent', 'tld', 'currency_code', 'currency_name',
                       'phone', 'postal_code_format', 'postal_code_regex',
                       'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            try:
                tree = lxml.html.fromstring(html)
                row = []
                for field in self.fields:
                    row.append(
                        tree.cssselect(
                            'table > tr#places_{}__row > td.w2p_fw'.format(
                                field))[0].text_content())
                self.writer.writerow(row)
            except lxml.etree.ParserError:
                # traceback and lxml.etree are assumed to be imported at the top of the full module
                print("url:{0}\thtml:{1}".format(url, html))
                traceback.print_exc()


if __name__ == '__main__':
    # link_crawler('http://example.webscraping.com/places/default/view/Afghanistan-1', '/(index|view)', scrape_callback=ScrapeCallback())

    link_crawler(
        'http://example.webscraping.com/places/default/view/Afghanistan-1',
        '/places/default/(index|view)',
        scrape_callback=ScrapeCallback())
Example #7
# -*- coding: utf-8 -*-

import csv
import re
import urlparse
import lxml.html
from link_crawler import link_crawler

FIELDS = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')


def scrape_callback(url, html):
    if re.search('/view/', url):
        tree = lxml.html.fromstring(html)
        row = [tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content() for field in FIELDS]
        print url, row


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com/', '/(index|view)', scrape_callback=scrape_callback)

Example #8
import lxml.html
import csv
import urllib2 
from link_crawler import link_crawler
import re

class ScrapeCallback:
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                l = tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))
                if len(l) > 0: 
                    row.append(l[0].text_content())
            self.writer.writerow(row)



link_crawler('http://example.webscraping.com/', '/(index|view)', max_depth=1, delay=0, scrape_callback=ScrapeCallback())


# print tree.cssselect('table > tr#places_area__row > td.w2p_fw')[0].text_content()
# print tree.cssselect('table > tr#places_area__row > td.w2p_fw')
# print tree.cssselect('table > tr#places_area__row > td.w2p_fw')
Example #9
    def __delitem__(self, url):
        path = self.url_to_path(url)
        try:
            os.remove(path)
        except OSError:
            pass

    def url_to_path(self, url):
        md5url = md5_str(url)
        return os.path.realpath(os.path.join(self.cache_dir, md5url))

    def has_expired(self, timestamp):
        return datetime.utcnow() > (timestamp + self.expires)

    def clear(self):
        """
        Remove all cached values
        """
        if os.path.isdir(self.cache_dir):
            shutil.rmtree(self.cache_dir)


if __name__ == '__main__':
    from datetime import timedelta
    link_crawler('http://example.webscraping.com/',
                 delay=3,
                 link_regex='/(index|view)',
                 max_urls=-1,
                 cache=DiskCache(expires=timedelta(hours=1)))
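
url_to_path here relies on an md5_str helper that is not part of the snippet. A minimal sketch of what it presumably does (hash the URL into a safe, fixed-length filename):

import hashlib


def md5_str(text):
    # hex digest of the MD5 hash of the UTF-8 encoded URL
    return hashlib.md5(text.encode('utf-8')).hexdigest()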
Example #10
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
        # restrict maximum number of characters
        filename = '/'.join(segment[:255] for segment in filename.split('/'))
        return os.path.join(self.cache_dir, filename)

    def has_expired(self, timestamp):
        """Return whether this timestamp has expired
        """
        return datetime.utcnow() > timestamp + self.expires

    def clear(self):
        """Remove all the cached values
        """
        if os.path.exists(self.cache_dir):
            shutil.rmtree(self.cache_dir)


# if __name__ == '__main__':
#     link_crawler('http://example.webscraping.com', '/places/default/(index|view)/', delay=5, num_retries=1, max_depth=1,user_agent='GoodCrawler',cache=DiskCache())

link_crawler('http://example.webscraping.com',
             '/places/default/(index|view)/',
             max_urls=20,
             delay=5,
             num_retries=1,
             max_depth=1,
             cache=DiskCache())
Example #11
import json


# use a callback class instead of a plain function so the csv writer keeps its state between calls
class ScrapeCallback:
    def __init__(self):
        self.writer = csv.writer(open('weather.csv', 'w', newline=''))

        # columns: weather, high/low temperature, wind
        self.fields = ('天气', '最高/低温', '风力')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        #if re.search('/view/', url):
        tree = lxml.html.fromstring(html)
        # one <p> element per day for weather, temperature and wind
        weas = tree.cssselect('p.wea')
        tems = tree.cssselect('p.tem')
        wins = tree.cssselect('p.win')
        for wea, tem, win in zip(weas, tems, wins):
            row = [wea.text_content().strip('\n'),
                   tem.text_content().strip('\n'),
                   win.text_content().strip('\n')]
            self.writer.writerow(row)


if __name__ == '__main__':
    link_crawler('http://www.weather.com.cn/weather/101020100.shtml',
                 '/(index|view)',
                 scrape_callback=ScrapeCallback())
Example #12
        except OSError:
            pass

    def url_to_path(self, url):
        components = urllib.parse.urlsplit(url)
        # when empty path set to /index.html
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
        # restrict maximum number of characters
        filename = '/'.join(segment[:255] for segment in filename.split('/'))
        return os.path.join(self.cache_dir, filename)

    def has_expired(self, timestamp):
        return datetime.utcnow() > timestamp + self.expires

    def clear(self):
        if os.path.exists(self.cache_dir):
            shutil.rmtree(self.cache_dir)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com',
                 '/places/default/(index|view)',
                 max_depth=1,
                 cache=DiskCache())
Example #13
        else:
            return True

    def __getitem__(self, url):
        """Load value at this URL
        """
        record = self.db.webpage.find_one({'_id': url})
        if record:
            # return record['result']
            return pickle.loads(zlib.decompress(record['result']))
        else:
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save value for this URL
        """
        # record = {'result': result, 'timestamp': datetime.utcnow()}
        record = {
            'result': Binary(zlib.compress(pickle.dumps(result))),
            'timestamp': datetime.utcnow()
        }
        self.db.webpage.update({'_id': url}, {'$set': record}, upsert=True)

    def clear(self):
        self.db.webpage.drop()


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com/places/default/index',
                 '/places/default/view|/places/default/index',
                 cache=MongoCache())
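
These MongoCache methods assume a constructor that the snippet does not include. A minimal sketch, assuming a local MongoDB server and a TTL index so the database expires old records itself; the database name and defaults are assumptions:

from datetime import timedelta
from pymongo import MongoClient


class MongoCache:
    def __init__(self, client=None, expires=timedelta(days=30)):
        # connect to the default local MongoDB instance unless a client is passed in
        self.client = MongoClient('localhost', 27017) if client is None else client
        self.db = self.client.cache
        # TTL index: MongoDB removes cached pages once the timestamp is older than `expires`
        self.db.webpage.create_index('timestamp',
                                     expireAfterSeconds=expires.total_seconds())

Note that webpage.update(..., upsert=True) as used above is the legacy pymongo API; current pymongo versions spell it update_one.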
Example #14
import lxml.html
import re
from link_crawler import link_crawler

FIELDS = ('area', 'population', 'iso', 'country', 'capital', 'continent',
          'tld', 'currency_code', 'currency_name', 'phone',
          'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')


def scrape_callback(url, html):
    if re.search('/view/', url):
        tree = lxml.html.fromstring(html)
        row = [
            tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(
                field))[0].text_content() for field in FIELDS
        ]
        print(url, row)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com',
                 '/places/default/(index|view)',
                 scrape_callback=scrape_callback)
Example #15
def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    #cache.clear()
    link_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback, cache=cache, timeout=10, ignore_robots=True)
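
The entry point for main() is cut off in this snippet; presumably the usual guard:

if __name__ == '__main__':
    main()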
Example #16
import csv
import re
import urlparse
import lxml.html
from link_crawler import link_crawler


class ScrapeCallback:
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital',
                       'continent', 'tld', 'currency_code', 'currency_name',
                       'phone', 'postal_code_format', 'postal_code_regex',
                       'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(
                    tree.cssselect(
                        'table > tr#places_{}__row > td.w2p_fw'.format(field))
                    [0].text_content())
            self.writer.writerow(row)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com/places/default/index',
                 '/places/default/view|/places/default/index',
                 scrape_callback=ScrapeCallback())
Example #17
# with ZipFile(StringIO(zipped_data)) as zf:
#     csv_filename = zf.namelist()[0]
#     print(csv_filename)
#     for _, website in csv.reader(zf.open(csv_filename)):
#         print(_ + "  " + website)
#         urls.append('http://' + website)


class AlexaCallback:
    def __init__(self, max_urls=100):
        self.max_urls = max_urls
        self.seed_url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'

    def __call__(self, url, html):
        if url == self.seed_url:
            urls = []
            print(html)
            with ZipFile(StringIO(html)) as zf:
                csv_filename = zf.namelist()[0]
                for _, website in csv.reader(zf.open(csv_filename)):
                    urls.append('http://' + website)
                    if len(urls) == self.max_urls:
                        break
            return urls


if __name__ == '__main__':
    link_crawler(seed_url=AlexaCallback().seed_url,
                 cache=DiskCache(),
                 scrape_callback=AlexaCallback())
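
This callback assumes csv, ZipFile and StringIO are imported further up and that html arrives as a str (Python 2). Under Python 3 the downloaded zip is bytes, so io.BytesIO is needed instead; a minimal sketch of the same parsing step (the helper name is illustrative):

import csv
import io
from zipfile import ZipFile


def parse_alexa_zip(zipped_bytes, max_urls=100):
    # extract the first CSV inside the Alexa top-1m archive and build seed URLs
    urls = []
    with ZipFile(io.BytesIO(zipped_bytes)) as zf:
        csv_filename = zf.namelist()[0]
        with zf.open(csv_filename) as fp:
            for _, website in csv.reader(io.TextIOWrapper(fp)):
                urls.append('http://' + website)
                if len(urls) == max_urls:
                    break
    return urls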
Example #18
import sys
sys.path.insert(0, './scripts')

import link_crawler
import scrape_callback
from scrape_callback import ScrapeCallback

if __name__ == '__main__':
    link_crawler.link_crawler('http://example.webscraping.com',
                              '/(places/default/index)',
                              delay=1,
                              num_retries=1,
                              max_depth=1,
                              user_agent='GoodCrawler')
    scrape_callback.link_crawler('http://example.webscraping.com/',
                                 '/(places/default/index)',
                                 scrape_callback=ScrapeCallback())
Example #19
        if self.compress:
            data = zlib.compress(data)

        with open(path, 'wb') as fp:
            fp.write(data)

    def __delitem__(self, url):
        # remove the cached result for this URL
        path = self.url_to_path(url)
        try:
            os.remove(path)
            os.removedirs(os.path.dirname(path))
        except OSError:
            pass

    def has_expired(self, timestamp):
        return datetime.utcnow() > timestamp + self.expires


    def clear(self):
        # remove all cached values
        if os.path.exists(self.cache_dir):
            shutil.rmtree(self.cache_dir)

if __name__ == '__main__':
    link_crawler('http://example.webscraping.com/', '/(index|view)', max_depth=1, cache=DiskCache(compress=False))
    #cache=DiskCache()
    #print cache['http://example.webscraping.com/view/Afghanistan-1']

 
Example #20
import csv
import re
import urllib.parse
import lxml.html
from link_crawler import link_crawler

FIELDS = ('area', 'population', 'iso', 'country', 'capital', 'continent',
          'tld', 'currency_code', 'currency_name', 'phone',
          'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')


def scrape_callback(url, html):
    if re.search('/view/', url):
        tree = lxml.html.fromstring(html)
        row = [
            tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(
                field))[0].text_content() for field in FIELDS
        ]
        print(url, row)


if __name__ == '__main__':
    url = 'http://example.webscraping.com/'
    link_crawler(url, '/(index|view)', scrape_callback=scrape_callback)
Example #21
def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    link_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback, cache=cache, user_agent='GoodCrawler', ignore_robots=True)
Example #22
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import csv
import re
import urllib
import lxml.html
from link_crawler import link_crawler

FIELDS = ('area', 'population', 'iso', 'country', 'capital', 'continent',
          'tld', 'currency_code', 'currency_name', 'phone',
          'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')


def scrape_callback(url, html):
    if re.search('/view/', url):
        tree = lxml.html.fromstring(html)
        row = [
            tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(
                field))[0].text_content() for field in FIELDS
        ]
        print(url, row)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com',
                 '/places/default/(index|view)',
                 delay=0,
                 num_retries=1,
                 max_depth=1,
                 scrape_callback=scrape_callback)
Example #23
from link_crawler import link_crawler
from scrapecallback import ScrapeCallback
from disk_cache import DiskCache

# link_crawler('http://example.webscraping.com/', '/(index|view)', max_depth=1, delay=0, scrape_callback=ScrapeCallback())

link_crawler('http://example.webscraping.com/', '/(index|view)', max_depth=1, delay=0, cache=DiskCache())
Example #24
            os.remove(path)
            os.removedirs(os.path.dirname(path))
        except OSError:
            pass

    def has_expired(self, timestamp):
        return datetime.utcnow() > timestamp + self.expires

    def clear(self):
        if os.path.exists(self.cache_dir):
            shutil.rmtree(self.cache_dir)


class ScrapeCallback:
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'wb'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            self.writer.writerow(row)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com/', '/(index|view)', scrape_callback=ScrapeCallback(), cache=DiskCache())
Example #25
        # when empty path set to /index.html
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
        # restrict maximum number of characters
        filename = '/'.join(segment[:255] for segment in filename.split('/'))
        return os.path.join(self.cache_dir, filename)


    def has_expired(self, timestamp):
        """Return whether this timestamp has expired
        """
        return datetime.utcnow() > timestamp + self.expires


    def clear(self):
        """Remove all the cached values
        """
        if os.path.exists(self.cache_dir):
            shutil.rmtree(self.cache_dir)



if __name__ == '__main__':
    link_crawler('http://example.webscraping.com/', '/(places/default/view)', cache=DiskCache())
Example #26
from scraper import scraper
from link_crawler import link_crawler

START_URL = 'https://www.statista.com/map/'

if __name__ == '__main__':
    follow = r'/map/'
    link_crawler(START_URL, link_regex=follow, scraper_callback=scraper)
Example #27
import re
import lxml.html
import link_crawler

FIELDS = ('area', 'population', 'iso', 'country', 'capital', 'continent',
          'tld', 'currency_code', 'currency_name', 'phone',
          'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')


def scrape_callback(url, html):
    if re.search('/view/', url):
        tree = lxml.html.fromstring(html)
        row = [
            tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(
                field))[0].text_content() for field in FIELDS
        ]
        print url, row


if __name__ == '__main__':
    link_crawler.link_crawler('http://example.webscraping.com/',
                              '/(index|view)',
                              scrape_callback=scrape_callback)
Example #28
                        item.get_text().encode('utf-8').replace('\'', '\'\'')
                        for item in content_right.find_all("span",
                                                           class_="rating_per")
                    ])
                else:
                    movie.extend(["", "", "", "", ""])
            else:
                movie.extend(["", "", "", "", "", "", ""])

            # there should be 25 fields in total; raise if not
            assert len(movie) == 25, "length of movie is invalid"

            # write the record to the Excel sheet
            self.writer.write_row(self.col, 0, movie)
            # move to the next row after each record is written
            self.col += 1

            # write the record to the database
            tb_movie = str(tuple([field.decode('utf-8') for field in movie
                                  ])).replace('u\'',
                                              '\'').decode("unicode-escape")
            wswpdb = WswpDb()
            wswpdb.insert_wswp_db(self.tb_name, (',').join(self.tb_fields),
                                  tb_movie)


if __name__ == '__main__':
    link_crawler('http://movie.douban.com',
                 '/subject/',
                 scrape_callback=ScrapeCallback())
Example #29
        if os.path.exists(path):
            with open(path, 'rb') as fp:
                data = fp.read()
                if self.compress:
                    data = zlib.decompress(data)
                return pickle.loads(data)
        else:
            # URL has not yet been cached
            raise KeyError(url + ' does not exist!')

    # cache the downloaded page on disk
    def __setitem__(self, url, result):
        """Save data to disk for this URL"""
        path = self.url_to_path(url)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)

        data = pickle.dumps(result)
        with open(path, 'wb') as fp:
            if self.compress:
                data = zlib.compress(data)
            fp.write(data)

if __name__ == '__main__':
    import time
    start = time.time()
    link_crawler(seed_url='http://example.webscraping.com/', link_regex='/(index|view)',
                 cache=DiskCache())
    end = time.time()
    print end - start
Example #30
        """Create file system path for this URL
        """
        components = urlparse.urlsplit(url)
        # when empty path set to /index.html
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
        # restrict maximum number of characters
        filename = '/'.join(segment[:255] for segment in filename.split('/'))
        return os.path.join(self.cache_dir, filename)

    def has_expired(self, timestamp):
        """Return whether this timestamp has expired
        """
        return datetime.utcnow() > timestamp + self.expires

    def clear(self):
        """Remove all the cached values
        """
        if os.path.exists(self.cache_dir):
            shutil.rmtree(self.cache_dir)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com/', '(.*?)/default/view(.*?)', cache=DiskCache())
Example #31
            return True

    def __getitem__(self, url):
        """Load value at this URL
        """
        record = self.db.webpage.find_one({'_id': url})
        if record:
            #return record['result']
            return pickle.loads(zlib.decompress(record['result']))
        else:
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save value for this URL
        """
        #record = {'result': result, 'timestamp': datetime.utcnow()}
        record = {
            'result': Binary(zlib.compress(pickle.dumps(result))),
            'timestamp': datetime.utcnow()
        }
        self.db.webpage.update({'_id': url}, {'$set': record}, upsert=True)

    def clear(self):
        self.db.webpage.drop()


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com/places/default/',
                 '/(index|view)',
                 cache=MongoCache())
Example #32
            return False
        else:
            return True

    def __getitem__(self, url):
        record = self.db.webpage.find_one({'_id': url})
        if record:
            return pickle.loads(zlib.decompress(record['result']))
        else:
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        record = {
            'result': Binary(zlib.compress(pickle.dumps(result))),
            'timestamp': datetime.utcnow()
        }
        self.db.webpage.update({'_id': url}, {'$set': record}, upsert=True)

    def clear(self):
        self.db.webpage.drop()


if __name__ == '__main__':
    cache = MongoCache()
    # url = 'http://example.webscraping.com'
    # result = {'html':'_'}
    # cache[url] = result
    # print (cache[url])
    link_crawler('http://example.webscraping.com',
                 link_regex='/(index|view)',
                 cache=cache)
Example #33
        # when empty path set to /index.html
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
        # restrict maximum number of characters
        filename = '/'.join(segment[:255] for segment in filename.split('/'))
        return os.path.join(self.cache_dir, filename)


    def has_expired(self, timestamp):
        """Return whether this timestamp has expired
        """
        return datetime.utcnow() > timestamp + self.expires


    def clear(self):
        """Remove all the cached values
        """
        if os.path.exists(self.cache_dir):
            shutil.rmtree(self.cache_dir)



if __name__ == '__main__':
    link_crawler('http://example.webscraping.com/', '/(index|view)', cache=DiskCache())
Example #34
CSV_FILE = 'countries.csv'
FIELDS = ('area', 'population', 'iso', 'country', 'capital', 'continent',
          'tld', 'currency_code', 'currency_name', 'phone',
          'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')


class ScrapeCallback:
    def __init__(self):
        self.writer = csv.writer(open(CSV_FILE, 'w'))
        self.fields = FIELDS
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = list()
            for field in self.fields:
                row.append(
                    tree.cssselect(
                        'table > tr#places_{}__row > td.w2p_fw'.format(field))
                    [0].text_content())
            print(
                'URL match!  --> scraping and writing to {}'.format(CSV_FILE))
            self.writer.writerow(row)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com/',
                 link_regex='/view',
                 scrape_callback=ScrapeCallback())
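
A side note on the csv handling used in several of these callbacks: under Python 3 the output file is normally opened with newline='' (and closed explicitly), otherwise blank rows can appear on Windows. A minimal sketch of that pattern, reusing the CSV_FILE and FIELDS names above; the helper name is illustrative:

def write_header(path=CSV_FILE):
    # open in text mode with newline='' as recommended by the csv module docs
    with open(path, 'w', newline='', encoding='utf-8') as fp:
        csv.writer(fp).writerow(FIELDS)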