Example #1
0
def main(argv):
    """Store Scrapinghub projects and their jobs in the local SQLite database.

    Args:
        argv: command-line arguments without the program name. The
            ``-d`` / ``--deleted`` flag makes the run also fetch deleted jobs.

    The schema is created from ``schema.sql`` when the file named by
    ``config.database`` does not exist yet. The database connection is closed
    even if an error occurs part-way through.
    """
    opts, _args = getopt.getopt(argv, "d", ["deleted"])
    getDeleted = any(opt in ("-d", "--deleted") for opt, _arg in opts)

    # Decide this before sqlite3.connect(), which creates the database file
    # when it is missing.
    createSchema = not os.path.isfile(config.database)

    connection = Connection(config.apikey)

    # connect to database
    db = sqlite3.connect(config.database)
    try:
        cursor = db.cursor()

        if createSchema:
            # Each line of the schema file is executed as its own statement.
            with open("schema.sql", "r") as schema_file:
                for line in schema_file:
                    cursor.execute(line)
            db.commit()

        for project_id in connection.project_ids():
            project = connection[project_id]

            cursor.execute(
                """SELECT *
			FROM shub_projects
			WHERE shub_id = ?""",
                (project.id,),
            )

            # insert if not exist
            if not cursor.fetchall():
                print("Guardando proyecto " + str(project.id) + "...")
                db.execute(
                    """INSERT INTO shub_projects (shub_id)
							VALUES (?)""",
                    [project.id],
                )
                # Commit explicitly: closing the connection with a pending
                # transaction would discard the new row.
                db.commit()

            # get finished jobs
            getJobs(db, project.jobs(state="finished"))

            # get deleted jobs
            if getDeleted:
                getJobs(db, project.jobs(state="deleted"))
    finally:
        db.close()
Example #2
0
def _has_project_access(project, endpoint, apikey):
    """Return whether ``project`` is among the project ids visible to ``apikey``.

    Raises:
        InvalidAuthException: the API error mentions "Authentication failed".
        RemoteErrorException: any other APIError, carrying its message.
    """
    conn = Connection(apikey, url=endpoint)
    try:
        return project in conn.project_ids()
    except APIError as e:
        # str(e) rather than e.message: exceptions have no ``message``
        # attribute on Python 3, so reading it would raise AttributeError.
        message = str(e)
        if "Authentication failed" in message:
            raise InvalidAuthException
        else:
            raise RemoteErrorException(message)
Example #3
0
def _has_project_access(project, endpoint, apikey):
    """Return whether ``project`` is among the project ids visible to ``apikey``.

    Raises:
        InvalidAuthException: the API error mentions 'Authentication failed'.
        RemoteErrorException: any other APIError, carrying its message.
    """
    conn = Connection(apikey, url=endpoint)
    try:
        return project in conn.project_ids()
    except APIError as e:
        # str(e) rather than e.message: exceptions have no ``message``
        # attribute on Python 3, so reading it would raise AttributeError.
        message = str(e)
        if 'Authentication failed' in message:
            raise InvalidAuthException
        else:
            raise RemoteErrorException(message)
Example #4
0
File: utils.py Project: rowhit/shub
def has_project_access(project, endpoint, apikey):
    """Tell whether ``apikey`` grants access to ``project``.

    A valid key that simply lacks access to the project yields ``False``.
    InvalidAuthException is raised only when the API error mentions an
    authentication failure; every other APIError is re-raised as
    RemoteErrorException with the error text.
    """
    connection = Connection(apikey, url=endpoint)
    try:
        return project in connection.project_ids()
    except APIError as error:
        if 'Authentication failed' not in str(error):
            raise RemoteErrorException(str(error))
        raise InvalidAuthException
Example #5
0
def fetchHourly(request):
    """Save the items of finished jobs tagged 'hourly' and answer 'success'.

    Jobs are requested from project 65427 with ``count=2``; the items of each
    returned job are handed to ``saveItems``. ``request`` is not used.
    """
    project = Connection(settings.SCRAPINGHUB_KEY)[65427]
    hourly_jobs = project.jobs(state='finished', has_tags='hourly', count=2)
    for finished_job in hourly_jobs:
        saveItems(finished_job.items())
    return HttpResponse('success')
Example #6
0
def fetch_hourly():
    """Save the items of finished jobs tagged 'hourly' from project 65427.

    Logs the task start, requests the jobs with ``count=2`` and passes the
    items of each returned job to ``saveItems``.
    """
    logger.info("Start task")
    project = Connection(settings.SCRAPINGHUB_KEY)[65427]
    hourly_jobs = project.jobs(state='finished', has_tags='hourly', count=2)
    for finished_job in hourly_jobs:
        saveItems(finished_job.items())
Example #7
0
    def __init__(self, crawler):
        """Read the HS_* settings from ``crawler`` and build the API clients.

        Also connects ``self.close_spider`` to the ``spider_closed`` signal.
        """
        cfg = crawler.settings

        # Connection and identity settings.
        self.hs_endpoint = cfg.get("HS_ENDPOINT")
        self.hs_auth = self._get_config(cfg, "HS_AUTH")
        # The SCRAPY_PROJECT_ID environment variable is passed as the third
        # argument (presumably the fallback value; see _get_config).
        self.hs_projectid = self._get_config(
            cfg, "HS_PROJECTID", os.environ.get('SCRAPY_PROJECT_ID')
        )

        # Frontier settings.
        self.hs_frontier = self._get_config(cfg, "HS_FRONTIER")
        self.hs_consume_from_slot = self._get_config(cfg, "HS_CONSUME_FROM_SLOT")
        self.hs_number_of_slots = cfg.getint(
            "HS_NUMBER_OF_SLOTS", DEFAULT_HS_NUMBER_OF_SLOTS
        )
        self.hs_max_links = cfg.getint("HS_MAX_LINKS", DEFAULT_MAX_LINKS)

        # Job-start settings.
        self.hs_start_job_enabled = cfg.getbool("HS_START_JOB_ENABLED", False)
        self.hs_start_job_on_reason = cfg.getlist(
            "HS_START_JOB_ON_REASON", ['finished']
        )

        # Project handle obtained through the Connection client.
        self.panel_project = Connection(self.hs_auth)[self.hs_projectid]

        # Hubstorage client, its project and that project's frontier.
        self.hsclient = HubstorageClient(
            auth=self.hs_auth, endpoint=self.hs_endpoint
        )
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links = defaultdict(set)
        self.batch_ids = []

        crawler.signals.connect(self.close_spider, signals.spider_closed)

        # Make sure the logger for hubstorage.batchuploader is configured
        logging.basicConfig()
Example #8
0
def get_last_24h_jobs(apikey, project_id):
    """Return the finished jobs that pass the 24-hour recency check.

    Every finished job of the project is tested with ``is_job_newer_than``
    against a cutoff of "now (UTC) minus 24 hours"; the matches are returned
    as a list, in the order the API yields them.
    """
    project = Project(Connection(apikey), project_id)
    cutoff = datetime.utcnow() - timedelta(hours=24)
    recent_jobs = []
    for finished_job in project.jobs(state='finished'):
        if is_job_newer_than(finished_job, cutoff):
            recent_jobs.append(finished_job)
    return recent_jobs
Example #9
0
def schedule_spider(project, endpoint, apikey, spider, arguments=(),
                    settings=()):
    """Schedule ``spider`` in ``project`` and return the scheduling result.

    ``arguments`` and ``settings`` are iterables of ``"key=value"`` strings.
    Settings are sent JSON-encoded as ``job_settings``; arguments are passed
    as keyword arguments. An APIError is re-raised as RemoteErrorException.
    """
    connection = Connection(apikey, url=endpoint)
    try:
        schedule = connection[project].schedule
        encoded_settings = json.dumps(
            dict(pair.split('=', 1) for pair in settings)
        )
        spider_args = dict(pair.split('=', 1) for pair in arguments)
        return schedule(spider, job_settings=encoded_settings, **spider_args)
    except APIError as error:
        raise RemoteErrorException(str(error))
Example #10
0
def schedule_spider(project, endpoint, apikey, spider, arguments=(), settings=(),
                    priority=DEFAULT_PRIORITY, units=None, tag=()):
    """Schedule ``spider`` in ``project`` and return the scheduling result.

    ``arguments`` and ``settings`` are iterables of ``"key=value"`` strings.
    Settings are sent JSON-encoded as ``job_settings``; arguments are passed
    as keyword arguments, together with ``units`` when it is not None.
    ``priority`` and ``tag`` (as ``add_tag``) are forwarded unchanged.
    An APIError is re-raised as RemoteErrorException.
    """
    connection = Connection(apikey, url=endpoint)
    try:
        spider_args = dict(pair.split('=', 1) for pair in arguments)
        if units is not None:
            spider_args['units'] = units
        schedule = connection[project].schedule
        encoded_settings = json.dumps(
            dict(pair.split('=', 1) for pair in settings)
        )
        return schedule(
            spider,
            job_settings=encoded_settings,
            priority=priority,
            add_tag=tag,
            **spider_args
        )
    except APIError as error:
        raise RemoteErrorException(str(error))
Example #11
0
class ConnectionTest(unittest.TestCase):
    """Checks on a Connection created from a URL and two further arguments."""

    def setUp(self):
        self.client = Connection('http://server/api/', 'john', 'doe')

    def test_initialization(self):
        """The client exposes its URL, a readable str() and a Basic auth header."""
        client = self.client
        self.assertEqual(client.url, 'http://server/api/')
        self.assertEqual(str(client), 'Connection(http://server/api/)')
        # 'am9objpkb2U=' is the base64 encoding of 'john:doe'.
        auth_header = client._request_headers['Authorization']
        self.assertEqual(auth_header, 'Basic am9objpkb2U=')

    def test_project_names(self):
        """project_names() returns the list from the mocked response."""
        payload = json.dumps({'status': 'ok', 'projects': ['foo', 'bar']})
        with MockResponse(self.client, payload) as response:
            self.assertEqual(self.client.project_names(), ['foo', 'bar'])

            expected_url = 'http://server/api/scrapyd/listprojects.json'
            self.assertEqual(response.url, expected_url)
            self.assertEqual(response.data, None)
Example #12
0
    def __init__(self, crawler):
        """Read the HS_* settings from ``crawler`` and build the API clients.

        Also connects ``self.close_spider`` to the ``spider_closed`` signal.
        """
        settings = crawler.settings

        def int_setting(name, default):
            # Use the default when the configured value is not a valid integer.
            try:
                return int(settings.get(name, default))
            except ValueError:
                return default

        self.crawler = crawler
        self.hs_endpoint = settings.get("HS_ENDPOINT")
        self.hs_auth = self._get_config(crawler, "HS_AUTH")
        self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
        self.hs_frontier = self._get_config(crawler, "HS_FRONTIER")
        self.hs_consume_from_slot = self._get_config(
            crawler, "HS_CONSUME_FROM_SLOT")

        self.hs_number_of_slots = int_setting(
            "HS_NUMBER_OF_SLOTS", DEFAULT_HS_NUMBER_OF_SLOTS)
        self.hs_max_links = int_setting("HS_MAX_LINKS", DEFAULT_MAX_LINKS)

        self.hs_start_job_enabled = settings.get("HS_START_JOB_ENABLED", False)
        self.hs_start_job_on_reason = settings.get(
            "HS_START_JOB_ON_REASON", ['finished'])
        self.hs_start_job_new_panel = settings.get(
            "HS_START_JOB_NEW_PANEL", False)

        # The Connection-based project handle is only created when the
        # HS_START_JOB_NEW_PANEL setting is falsy.
        if not self.hs_start_job_new_panel:
            self.oldpanel_project = Connection(self.hs_auth)[self.hs_projectid]

        # Hubstorage client, its project and that project's frontier.
        self.hsclient = HubstorageClient(
            auth=self.hs_auth, endpoint=self.hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links_count = defaultdict(int)
        self.batch_ids = []

        crawler.signals.connect(self.close_spider, signals.spider_closed)

        # Make sure the logger for hubstorage.batchuploader is configured
        logging.basicConfig()
Example #13
0
    def handle(self, *args, **kwargs):
        """Create a JusticeOfPeace record for every item of a job.

        Keyword Args:
            job_id: id of the job whose items are imported.
            type: feed type selecting the item processor; one of
                'nsw', 'act', 'qld', 'sa', 'vic', 'nt', 'wa'.

        Raises:
            KeyError: if ``job_id`` or ``type`` is missing from ``kwargs``.
            Exception: if ``type`` is not one of the known feed types.
        """
        # NOTE(review): this credential is hard-coded in source (presumably
        # the API key) -- consider loading it from settings or the environment.
        conn = Connection('1a22b051feb8448fa71bb1cc2ea4aa9c')
        project = conn[62659]
        job_id = kwargs['job_id']
        feed_type = kwargs['type']

        processor_maps = {
            'nsw': process_nsw,
            'act': process_act,
            'qld': process_qld,
            'sa': process_sa,
            'vic': process_vic,
            'nt': process_nt,
            'wa': process_wa,
        }
        # .get() so an unknown type reaches the explicit error below; plain
        # indexing raised KeyError first and made that branch unreachable.
        processor = processor_maps.get(feed_type)
        if processor is None:
            raise Exception('Unknown type {0}'.format(feed_type))

        for item in project.job(job_id).items():
            # Named ``fields`` to avoid shadowing the builtin ``dict``.
            fields = processor(item)
            JusticeOfPeace.objects.create(**fields)
Example #14
0
# !/usr/bin/python
# -*- coding: utf-8 -*-
import MySQLdb
import sys
from geopy.exc import GeocoderTimedOut
sys.path.insert(0, 'C:\Users\home\greekbeachesapp')
from sunthesis import elegxos_paralias
from scrapinghub import Connection
from geopy.geocoders import Nominatim
import time
# NOTE(review): the credential below is hard-coded in source (presumably the
# Scrapinghub API key) -- consider moving it out of the file.
conn = Connection('42e697f0d35348e9b6c70d6e74dcbc93')
# Debug output: the connection object and the project ids it reports.
print conn
print conn.project_ids()
project = conn[78127]
# Collect and print the ids of the jobs returned for state='finished'.
jobs = project.jobs(state='finished')
jobs_id = [x.id for x in jobs]
print jobs_id
# Work on one fixed job, addressed by its literal id.
job = project.job(u'78127/1/1')
# presumably the site the items of this job were scraped from -- TODO confirm
source = 'thassos-view.com'
# MySQL connection parameters.
servername = "localhost"
username = "******"
password = ""
dbname = "database"
# Open the MySQL connection with UTF-8 charset and unicode results enabled.
db = MySQLdb.connect(servername,
                     username,
                     password,
                     dbname,
                     charset="utf8",
                     use_unicode=True)
cursor = db.cursor()
for item in job.items():
Example #15
0
# !/usr/bin/python
# -*- coding: utf-8 -*-
import MySQLdb
import sys
from geopy.exc import GeocoderTimedOut
sys.path.insert(0, 'C:\Users\home\greekbeachesapp')
from sunthesis import elegxos_paralias
from scrapinghub import Connection
from geopy.geocoders import Nominatim
import time
# NOTE(review): the credential below is hard-coded in source (presumably the
# Scrapinghub API key) -- consider moving it out of the file.
conn = Connection('57c54e0d05fb44439a9fc0887fb6ab3e')
# Debug output: the connection object and the project ids it reports.
print conn
print conn.project_ids()
project = conn[77608]
# Collect and print the ids of the jobs returned for state='finished'.
jobs = project.jobs(state='finished')
jobs_id = [x.id for x in jobs]
print jobs_id
# Work on one fixed job, addressed by its literal id.
job = project.job(u'77608/1/1')
# presumably the site the items of this job were scraped from -- TODO confirm
source = 'e-zakynthos.com'
# MySQL connection parameters.
servername = "localhost"
username = "******"
password = ""
dbname = "database"
# Open the MySQL connection with UTF-8 charset and unicode results enabled.
db = MySQLdb.connect(servername,
                     username,
                     password,
                     dbname,
                     charset="utf8",
                     use_unicode=True)
cursor = db.cursor()
# The escaped literal spells the Greek word for "Zakynthos".
z = u'\u0396\u03ac\u03ba\u03c5\u03bd\u03b8\u03bf\u03c2'
Example #16
0
def main(args):
    """Print the ranked items of a project, one per line.

    Reads ``args.apikey``, ``args.project_id`` and ``args.top``. The items
    come from ``fetch_latest_job_items`` and are ranked by ``rank_items``.
    """
    connection = Connection(args.apikey)
    latest_items = fetch_latest_job_items(connection[args.project_id])
    ranked = rank_items(latest_items, args.top)
    for entry in ranked:
        print(entry)
def test_connection_init_with_default_url():
    """A Connection built with only an apikey uses Connection.DEFAULT_ENDPOINT."""
    client = Connection(apikey='testkey')
    actual_url = client.url
    assert actual_url == Connection.DEFAULT_ENDPOINT
def test_connection_init_assert_apikey_not_url():
    """Passing a URL as the apikey must raise AssertionError."""
    url_as_key = dict(password='******', apikey='http://some-url')
    with pytest.raises(AssertionError):
        Connection(**url_as_key)
def test_connection_init_use_key_from_env():
    """A Connection built without arguments ends up with apikey 'testkey'.

    Presumably the key comes from an environment variable set by the test
    setup, which is not visible here.
    """
    client = Connection()
    resolved_key = client.apikey
    assert resolved_key == 'testkey'
def test_connection_init_fail_wo_apikey(monkeypatch):
    """Without SH_APIKEY in the environment, Connection() raises RuntimeError."""
    # raising=False: do not fail if the variable was not set to begin with.
    monkeypatch.delenv('SH_APIKEY', raising=False)
    expect_runtime_error = pytest.raises(RuntimeError)
    with expect_runtime_error:
        Connection()
Example #21
0
 def setUp(self):
     """Create the client under test from three fixed positional arguments."""
     ctor_args = ('http://server/api/', 'john', 'doe')
     self.client = Connection(*ctor_args)
def test_connection_init_with_custom_timeout():
    """An explicit connection_timeout is stored on the connection."""
    client = Connection(apikey='testkey', connection_timeout=60)
    stored_timeout = client._connection_timeout
    assert stored_timeout == 60
def test_connection_init_with_default_timeout():
    """When no connection_timeout is given, the stored timeout is None."""
    client = Connection(apikey='testkey')
    stored_timeout = client._connection_timeout
    assert stored_timeout is None
def connection():
    """Return a Connection built with a test apikey and a test URL."""
    options = {'apikey': 'testkey', 'url': 'http://test-url'}
    return Connection(**options)