def main(*args, **kwargs):
    """Backfill a ``timestamp`` field onto mongo cache records that lack one.

    Expects ``kwargs['database']`` to name the mongo database holding the
    spider cache.
    """
    bot = Spider()
    bot.setup_cache(backend='mongo', database=kwargs['database'])
    now = int(time.time())
    updated = 0
    # timeout=False is the legacy pymongo flag that keeps the server-side
    # cursor alive for the duration of a long scan.
    query = {'timestamp': {'$exists': False}}
    for record in bot.cache.db.cache.find(query, timeout=False):
        bot.cache.db.cache.update({'_id': record['_id']},
                                  {'$set': {'timestamp': now}})
        updated += 1
    print('Records updated: %d' % updated)
def __init__(self):
    """Set up a single-threaded spider backed by an rdflib graph."""
    Spider.__init__(self, thread_number=1)
    self.setup_grab(timeout=120)
    self.repo = rdflib.Graph(store='default')
    # Register every vocabulary prefix used by this crawler; the order
    # matches the original one-by-one bind calls.
    for prefix, namespace in (
        ('foaf', FOAF),
        ('swc', SWC),
        ('skos', SKOS),
        ('swrc', SWRC),
        ('dbpedia-owl', DBPEDIAOWL),
        ('bibo', BIBO),
        ('dcterms', DCTERMS),
        ('dc', DC),
        ('timeline', TIMELINE),
    ):
        self.repo.bind(prefix, namespace)
def main(*args, **kwargs):
    """Stamp every mongo cache record missing a ``timestamp`` with "now"."""
    bot = Spider()
    cache_opts = {'database': kwargs['database']}
    bot.setup_cache(backend='mongo', **cache_opts)
    stamp = int(time.time())
    collection = bot.cache.db.cache
    # timeout=False (legacy pymongo) prevents the cursor from being reaped
    # mid-scan on large caches.
    missing = collection.find({'timestamp': {'$exists': False}}, timeout=False)
    count = 0
    for doc in missing:
        collection.update({'_id': doc['_id']}, {'$set': {'timestamp': stamp}})
        count += 1
    print('Records updated: %d' % count)
def main(*args, **kwargs):
    """Ensure the MySQL ``cache`` table has a ``timestamp`` column.

    Connects using ``database`` (plus optional ``user``/``passwd``) from
    *kwargs* and, when the column is missing, adds an INT ``timestamp``
    column defaulted to the current epoch time.
    """
    bot = Spider()
    opts = {'database': kwargs['database']}
    if kwargs.get('user'):
        opts['user'] = kwargs['user']
    if kwargs.get('passwd'):
        opts['passwd'] = kwargs['passwd']
    bot.setup_cache(backend='mysql', **opts)
    cursor = bot.cache.conn.cursor()
    # A LIMIT 1 probe is enough to populate cursor.description with the
    # column metadata for the cache table.
    cursor.execute('SELECT * FROM cache LIMIT 1')
    cols = [x[0] for x in cursor.description]
    ts = int(time.time())
    if 'timestamp' not in cols:
        # print() keeps this runnable on Python 3 (original used the
        # Python 2 print statement).
        print('Cache table does not have timestamp column. Adding it...')
        # %-interpolation is acceptable here: ts is an int we computed
        # ourselves, and a DDL DEFAULT cannot be bound as a query parameter.
        cursor.execute('''
    ALTER TABLE cache ADD COLUMN timestamp INT NOT NULL DEFAULT %s''' % ts)
def main(*args, **kwargs):
    """Add a ``timestamp`` column to the MySQL ``cache`` table if absent.

    ``kwargs`` supplies the connection options: ``database`` (required),
    ``user`` and ``passwd`` (optional). The new column defaults to the
    epoch time at which the migration ran.
    """
    bot = Spider()
    opts = {'database': kwargs['database']}
    if kwargs.get('user'):
        opts['user'] = kwargs['user']
    if kwargs.get('passwd'):
        opts['passwd'] = kwargs['passwd']
    bot.setup_cache(backend='mysql', **opts)
    cursor = bot.cache.conn.cursor()
    # Probe one row so cursor.description exposes the table's columns.
    cursor.execute('SELECT * FROM cache LIMIT 1')
    cols = [x[0] for x in cursor.description]
    ts = int(time.time())
    if 'timestamp' not in cols:  # idiomatic form of "not ... in"
        print('Cache table does not have timestamp column. Adding it...')
        # ts is a locally computed int, so %-interpolation into DDL is safe;
        # DEFAULT values cannot be passed as bound parameters.
        cursor.execute('''
    ALTER TABLE cache ADD COLUMN timestamp INT NOT NULL DEFAULT %s''' % ts)
def main(*args, **kwargs):
    """Create the ``timestamp`` column on the MySQL cache table when missing.

    Connection options come from ``kwargs``: ``database`` (required) and
    optional ``user``/``passwd``.
    """
    bot = Spider()
    opts = {"database": kwargs["database"]}
    if kwargs.get("user"):
        opts["user"] = kwargs["user"]
    if kwargs.get("passwd"):
        opts["passwd"] = kwargs["passwd"]
    bot.setup_cache(backend="mysql", **opts)
    cursor = bot.cache.conn.cursor()
    # Fetching a single row is enough to fill cursor.description with
    # column metadata.
    cursor.execute("SELECT * FROM cache LIMIT 1")
    cols = [x[0] for x in cursor.description]
    ts = int(time.time())
    if "timestamp" not in cols:  # idiomatic form of "not ... in"
        print("Cache table does not have timestamp column. Adding it...")
        # ts is an int computed locally, so string interpolation into the
        # DDL statement is safe (DEFAULT cannot be a bound parameter).
        cursor.execute(
            """
    ALTER TABLE cache ADD COLUMN timestamp INT NOT NULL DEFAULT %s"""
            % ts
        )
def grab_control(request):
    """Render the grab control form with spider and command choices filled in."""
    form = ControlForm(request.GET or None)
    registry = build_spider_registry(build_global_config())
    spiders = [(name, name) for name in registry.keys()]
    commands = [(name, name) for name in Spider.get_available_command_names()]
    # The form field and its widget each hold their own copy of the
    # choices, so both must be updated together.
    for field_name, choices in (('spider', spiders), ('command', commands)):
        field = form.fields[field_name]
        field.choices = choices
        field.widget.choices = choices
    return render(request, 'grabstat/control_form.html', {'form': form})
def __init__(self, platform, policy, thread_number=10, proxy_support=False,
             proxy_servers=None, proxy_type='http'):
    """
    :param thread_number: The number of threads
    :param proxy_support: Enable/disable proxy support
    :param proxy_servers: List of proxies ['host:port', 'host:port', etc.]
    :param proxy_type: The type of the proxy (http, socks4, socks5)
    """
    self.platform = platform
    self.proxy_servers = proxy_servers
    # Without proxy support the proxy type is cleared alongside the
    # no-op proxy getter.
    self.proxy_type = proxy_type if proxy_support else None
    if proxy_support:
        # Pick a random proxy from the configured pool on each call.
        self.get_proxy = lambda: choice(self.proxy_servers)
    else:
        self.get_proxy = lambda: None
    self.posts = {}
    # Initialize both bases explicitly, Fetcher first, as before.
    Fetcher.__init__(self, platform, policy)
    Spider.__init__(self, thread_number=thread_number, network_try_limit=5)
# !/usr/bin/env python # coding: utf-8 from grab.spider import Spider, Task from weblib.logs import default_logging from grab import Grab import logging from database import db class {{ PROJECT_NAME_CAMELCASE }}Spider(Spider): # noqa def task_generator(self): yield Task('initial', url='') def task_initial(self, grab, task): pass
# NOTE(review): cookiecutter template — the jinja expression in the class name
# is substituted when the project is generated, so this file is not valid
# Python until rendered.
from pprint import pprint
from urllib.parse import quote_plus, urlsplit, urljoin

from grab.spider import Spider, Task
from grab import Grab
from grab.spider.decorators import integrity
from weblib.error import DataNotValid

from project.database import db


class {{ cookiecutter.project_name.title().replace('_', '') }}Spider(Spider):
    # Skeleton spider for the generated project.

    def task_generator(self):
        # Stub: the generated project yields its seed Tasks here.
        pass
def shutdown(self):
    """Flush the accumulated RDF graph to ``rdfdb.ttl`` on spider shutdown."""
    Spider.shutdown(self)
    # Hand rdflib the destination path instead of an open handle: the
    # original opened a text-mode file and never closed it, leaking the
    # descriptor; with a path rdflib opens, writes and closes the file
    # itself with the correct mode/encoding.
    self.repo.serialize(destination='rdfdb.ttl', format='turtle')
    self.repo.close()
def __init__(self, id_list, thread_number):
    """Store the ids to crawl and initialize the base Spider.

    :param id_list: identifiers this spider will process
    :param thread_number: worker thread count passed through to Spider
    """
    self.id_list = id_list
    Spider.__init__(self, thread_number=thread_number)