Example #1
0
def main(*args, **kwargs):
    """Backfill the ``timestamp`` field on cached Mongo documents lacking it.

    Sets up a ``Spider`` with the mongo cache backend, then stamps every
    document of the ``cache`` collection that has no ``timestamp`` field
    with the current Unix time (one value shared by all documents updated
    in this run). The number of updated documents is printed at the end.

    :param kwargs: must contain ``database``, which is forwarded to
        ``Spider.setup_cache``. Positional arguments are ignored.
    :raises KeyError: if ``database`` is missing from ``kwargs``.
    """
    bot = Spider()
    opts = {'database': kwargs['database']}
    bot.setup_cache(backend='mongo', **opts)

    ts = int(time.time())
    count = 0
    collection = bot.cache.db.cache
    cursor = collection.find({'timestamp': {'$exists': False}}, timeout=False)
    try:
        for item in cursor:
            collection.update({'_id': item['_id']},
                              {'$set': {'timestamp': ts}})
            count += 1
    finally:
        # The cursor was opened with timeout=False, so it must be closed
        # explicitly -- otherwise an error in the loop would leave it open.
        cursor.close()
    print('Records updated: %d' % count)
Example #2
0
 def __init__(self):
     """
     Create a single-threaded spider backed by an in-process RDF graph.

     The base ``Spider`` is initialised with ``thread_number=1``, Grab is
     configured with ``timeout=120``, and ``self.repo`` becomes an
     ``rdflib.Graph`` on the ``'default'`` store with the vocabulary
     prefixes below bound to it.
     """
     Spider.__init__(self, thread_number=1)
     self.setup_grab(timeout=120)
     self.repo = rdflib.Graph(store='default')

     # (prefix, namespace) pairs, bound in this exact order.
     namespace_bindings = (
         ('foaf', FOAF),
         ('swc', SWC),
         ('skos', SKOS),
         ('swrc', SWRC),
         ('dbpedia-owl', DBPEDIAOWL),
         ('bibo', BIBO),
         ('dcterms', DCTERMS),
         ('dc', DC),
         ('timeline', TIMELINE),
     )
     for prefix, namespace in namespace_bindings:
         self.repo.bind(prefix, namespace)
Example #3
0
 def __init__(self):
     """
     Initialise the spider and its RDF store.

     Runs the base ``Spider`` constructor with one thread, applies a Grab
     ``timeout`` of 120, creates ``self.repo`` as an ``rdflib.Graph`` using
     the ``'default'`` store, and registers the namespace prefixes used
     when the graph is serialised.
     """
     Spider.__init__(self, thread_number=1)
     self.setup_grab(timeout=120)

     graph = rdflib.Graph(store='default')
     self.repo = graph

     prefixes = ['foaf', 'swc', 'skos', 'swrc', 'dbpedia-owl',
                 'bibo', 'dcterms', 'dc', 'timeline']
     namespaces = [FOAF, SWC, SKOS, SWRC, DBPEDIAOWL,
                   BIBO, DCTERMS, DC, TIMELINE]
     # Bind each prefix to its namespace, preserving the listed order.
     for label, vocabulary in zip(prefixes, namespaces):
         self.repo.bind(label, vocabulary)
Example #4
0
def main(*args, **kwargs):
    """Add a ``timestamp`` to every cached Mongo document that has none.

    A ``Spider`` is configured with the mongo cache backend; each document
    in the ``cache`` collection without a ``timestamp`` field is then
    updated with the Unix time taken once at the start of the run, and the
    total number of updated documents is printed.

    :param kwargs: must contain ``database``, which is passed through to
        ``Spider.setup_cache``. Positional arguments are ignored.
    :raises KeyError: if ``database`` is missing from ``kwargs``.
    """
    bot = Spider()
    opts = {'database': kwargs['database']}
    bot.setup_cache(backend='mongo', **opts)

    ts = int(time.time())
    count = 0
    collection = bot.cache.db.cache
    missing_timestamp = {'timestamp': {'$exists': False}}
    cursor = collection.find(missing_timestamp, timeout=False)
    try:
        for item in cursor:
            collection.update({'_id': item['_id']},
                              {'$set': {'timestamp': ts}})
            count += 1
    finally:
        # A timeout=False cursor has to be closed by the caller; do it even
        # when an update fails part-way through the scan.
        cursor.close()
    print('Records updated: %d' % count)
Example #5
0
def main(*args, **kwargs):
    """Ensure the MySQL ``cache`` table has a ``timestamp`` column.

    Sets up a ``Spider`` with the mysql cache backend, reads the column
    names of the ``cache`` table from a one-row ``SELECT``, and, when no
    ``timestamp`` column is present, adds one as ``INT NOT NULL`` with the
    current Unix time as its default.

    :param kwargs: must contain ``database``; ``user`` and ``passwd`` are
        forwarded to ``Spider.setup_cache`` only when truthy. Positional
        arguments are ignored.
    :raises KeyError: if ``database`` is missing from ``kwargs``.
    """
    bot = Spider()
    opts = {'database': kwargs['database']}
    if kwargs.get('user'):
        opts['user'] = kwargs['user']
    if kwargs.get('passwd'):
        opts['passwd'] = kwargs['passwd']
    bot.setup_cache(backend='mysql', **opts)

    cursor = bot.cache.conn.cursor()
    cursor.execute('SELECT * FROM cache LIMIT 1')
    # cursor.description holds one sequence per column; item 0 is its name.
    cols = [x[0] for x in cursor.description]
    ts = int(time.time())
    if 'timestamp' not in cols:
        # print() with one argument behaves the same on Python 2 and 3.
        print('Cache table does not have timestamp column. Adding it...')
        # ts is a locally computed int, so interpolating it into the DDL
        # statement does not expose it to external input.
        cursor.execute('''
            ALTER TABLE cache
            ADD COLUMN timestamp INT NOT NULL DEFAULT %s''' % ts)
Example #6
0
def main(*args, **kwargs):
    """Add a ``timestamp`` column to the MySQL ``cache`` table if missing.

    The cache backend is configured from ``kwargs`` (``database`` is
    required; ``user`` and ``passwd`` are used only when truthy). The
    table's column names are taken from a one-row ``SELECT``; when
    ``timestamp`` is not among them the column is created with the current
    Unix time as its default value.
    """
    bot = Spider()

    opts = {'database': kwargs['database']}
    for credential in ('user', 'passwd'):
        if kwargs.get(credential):
            opts[credential] = kwargs[credential]
    bot.setup_cache(backend='mysql', **opts)

    cursor = bot.cache.conn.cursor()
    cursor.execute('SELECT * FROM cache LIMIT 1')
    # First item of each description entry is the column name.
    column_names = [column[0] for column in cursor.description]

    ts = int(time.time())
    if 'timestamp' not in column_names:
        print('Cache table does not have timestamp column. Adding it...')
        alter_statement = '''
            ALTER TABLE cache
            ADD COLUMN timestamp INT NOT NULL DEFAULT %s''' % ts
        cursor.execute(alter_statement)
Example #7
0
def main(*args, **kwargs):
    """Make sure the MySQL ``cache`` table carries a ``timestamp`` column.

    ``kwargs["database"]`` is mandatory; ``user`` and ``passwd`` are passed
    to the cache backend only when they are truthy. If the table has no
    ``timestamp`` column, one is added (``INT NOT NULL``) whose default is
    the Unix time at which this function ran.
    """
    bot = Spider()

    opts = {"database": kwargs["database"]}
    # Optional credentials: copy across only the ones actually supplied.
    opts.update(
        {key: kwargs[key] for key in ("user", "passwd") if kwargs.get(key)}
    )
    bot.setup_cache(backend="mysql", **opts)

    cursor = bot.cache.conn.cursor()
    cursor.execute("SELECT * FROM cache LIMIT 1")
    existing_columns = [entry[0] for entry in cursor.description]
    ts = int(time.time())

    has_timestamp = "timestamp" in existing_columns
    if not has_timestamp:
        print("Cache table does not have timestamp column. Adding it...")
        ddl_template = """
            ALTER TABLE cache
            ADD COLUMN timestamp INT NOT NULL DEFAULT %s"""
        cursor.execute(ddl_template % ts)
Example #8
0
def grab_control(request):
    """Render the control form with spider and command choices filled in.

    The form is bound to ``request.GET`` when that is non-empty, otherwise
    left unbound. The ``spider`` choices come from the keys of the spider
    registry built from the global config; the ``command`` choices come
    from ``Spider.get_available_command_names()``. Each choice uses the
    name as both value and label, and is set on the field and its widget.
    """
    def as_choices(names):
        # (value, label) pairs where both halves are the name itself.
        return [(name, name) for name in names]

    form = ControlForm(request.GET or None)

    registry = build_spider_registry(build_global_config())
    spider_choices = as_choices(registry.keys())
    spider_field = form.fields['spider']
    spider_field.choices = spider_choices
    spider_field.widget.choices = spider_choices

    command_choices = as_choices(Spider.get_available_command_names())
    command_field = form.fields['command']
    command_field.choices = command_choices
    command_field.widget.choices = command_choices

    return render(request, 'grabstat/control_form.html', {'form': form})
Example #9
0
def grab_control(request):
    """Build and render the spider control form.

    ``request.GET`` (or ``None`` when it is empty) is used as the form
    data. The ``spider`` and ``command`` fields, and their widgets, get
    their choices at request time: registered spider names and available
    command names respectively, each as a ``(name, name)`` pair.
    """
    form_data = request.GET or None
    form = ControlForm(form_data)

    global_config = build_global_config()
    spider_names = build_spider_registry(global_config).keys()
    spider_choices = [(spider, spider) for spider in spider_names]
    form.fields['spider'].choices = spider_choices
    form.fields['spider'].widget.choices = spider_choices

    command_names = Spider.get_available_command_names()
    command_choices = [(command, command) for command in command_names]
    form.fields['command'].choices = command_choices
    form.fields['command'].widget.choices = command_choices

    context = dict(form=form)
    return render(request, 'grabstat/control_form.html', context)
Example #10
0
 def __init__(self, platform, policy, thread_number=10,
              proxy_support=False, proxy_servers=None, proxy_type='http'):
     """
     Initialise the fetcher/spider pair and its proxy selection.

     :param platform: Stored on ``self.platform`` and passed to
         ``Fetcher.__init__``
     :param policy: Passed to ``Fetcher.__init__``
     :param thread_number: The number of threads
     :param proxy_support: Enable/disable proxy support
     :param proxy_servers: List of proxies ['host:port', 'host:port', etc.]
     :param proxy_type: The type of the proxy (http, socks4, socks5);
         stored as ``None`` when proxy support is disabled
     """
     self.platform = platform
     self.proxy_servers = proxy_servers
     # The proxy type is only kept when proxying is switched on.
     self.proxy_type = proxy_type if proxy_support else None

     if not proxy_support:
         self.get_proxy = lambda: None
     else:
         # Looked up lazily: reads self.proxy_servers at call time.
         self.get_proxy = lambda: choice(self.proxy_servers)

     self.posts = {}
     Fetcher.__init__(self, platform, policy)
     Spider.__init__(self, thread_number=thread_number, network_try_limit=5)
Example #11
0
#!/usr/bin/env python
# coding: utf-8
from grab.spider import Spider, Task
from weblib.logs import default_logging
from grab import Grab
import logging

from database import db


class {{ PROJECT_NAME_CAMELCASE }}Spider(Spider):  # noqa
    """Skeleton spider.

    The ``{{ PROJECT_NAME_CAMELCASE }}`` part of the class name is a
    template placeholder -- presumably substituted when a project is
    generated from this file (TODO confirm against the template tooling).
    """

    def task_generator(self):
        """Yield a single task named ``'initial'`` with an empty URL."""
        # The empty url is a stub to be replaced with a real start address.
        yield Task('initial', url='')

    def task_initial(self, grab, task):
        """Stub for the ``'initial'`` task; does nothing yet.

        NOTE(review): presumably called by the Spider for tasks named
        ``'initial'`` via its ``task_<name>`` convention -- confirm.
        """
        pass
from pprint import pprint
from urllib.parse import quote_plus, urlsplit, urljoin

from grab.spider import Spider, Task
from grab import Grab
from grab.spider.decorators import integrity
from weblib.error import DataNotValid

from project.database import db


class {{ cookiecutter.project_name.title().replace('_', '') }}Spider(Spider):
    """Skeleton spider.

    The class-name prefix is a template expression over
    ``cookiecutter.project_name`` (title-cased, underscores removed) and is
    not valid Python until the template has been rendered.
    """

    def task_generator(self):
        """Placeholder: the body is ``pass``, so no tasks are produced."""
        # NOTE(review): with a bare ``pass`` this returns None rather than a
        # generator -- fill in ``yield Task(...)`` statements before use.
        pass
 def shutdown(self):
     """
     Run the base ``Spider`` shutdown, then dump the RDF graph to disk.

     ``self.repo`` is serialised in Turtle format to ``rdfdb.ttl`` in the
     current working directory (any existing file is overwritten), after
     which the graph is closed.
     """
     Spider.shutdown(self)
     # The context manager closes the file handle, and so flushes the
     # written data, even if serialisation raises.
     with open('rdfdb.ttl', 'w') as f:
         self.repo.serialize(f, format='turtle')
     self.repo.close()
Example #14
0
 def __init__(self, id_list, thread_number):
     """
     Initialise the base ``Spider`` and store ``id_list`` on the instance.

     :param id_list: Kept unchanged as ``self.id_list``; presumably the ids
         this spider will process -- confirm against the task generator
     :param thread_number: Forwarded to ``Spider.__init__`` as
         ``thread_number``
     """
     Spider.__init__(self, thread_number=thread_number)
     self.id_list = id_list
Example #15
0
 def shutdown(self):
     """
     Shut the spider down and persist the collected RDF graph.

     After the base ``Spider.shutdown`` has run, ``self.repo`` is written
     to ``rdfdb.ttl`` (Turtle format, file opened for writing in the
     current working directory) and the graph is then closed.
     """
     Spider.shutdown(self)
     # Use a context manager so the output file is always closed; the
     # original left the handle open after serialising.
     with open('rdfdb.ttl', 'w') as f:
         self.repo.serialize(f, format='turtle')
     self.repo.close()