def main(argv):
    getDeleted = False
    opts, args = getopt.getopt(argv, "d", ["deleted"])
    for opt, arg in opts:
        if opt in ("-d", "--deleted"):
            getDeleted = True

    # create the schema if the database file does not exist yet
    if not os.path.isfile(config.database):
        createSchema = True
    else:
        createSchema = False

    connection = Connection(config.apikey)

    # connect to the database
    db = sqlite3.connect(config.database)
    cursor = db.cursor()

    if createSchema:
        # open the schema definition file and run each statement
        file = open("schema.sql", "r")
        for line in file.readlines():
            cursor.execute(line)
        db.commit()

    # get projects
    project_ids = connection.project_ids()
    for project_id in project_ids:
        project = connection[project_id]
        cursor.execute(
            """SELECT * FROM shub_projects WHERE shub_id = ?""",
            (project.id,),
        )
        # insert the project if it is not stored yet
        if len(cursor.fetchall()) == 0:
            print("Saving project " + str(project.id) + "...")
            db.execute(
                """INSERT INTO shub_projects (shub_id) VALUES (?)""",
                [project.id],
            )

        # get finished jobs
        getJobs(db, project.jobs(state="finished"))

        # get deleted jobs
        if getDeleted:
            getJobs(db, project.jobs(state="deleted"))

    db.close()
def _has_project_access(project, endpoint, apikey):
    conn = Connection(apikey, url=endpoint)
    try:
        return project in conn.project_ids()
    except APIError as e:
        if "Authentication failed" in e.message:
            raise InvalidAuthException
        else:
            raise RemoteErrorException(e.message)
def has_project_access(project, endpoint, apikey):
    """Check whether an API key has access to a given project.

    May raise InvalidAuthException if the API key is invalid (but not if it
    is valid but lacks access to the project).
    """
    conn = Connection(apikey, url=endpoint)
    try:
        return project in conn.project_ids()
    except APIError as e:
        if 'Authentication failed' in str(e):
            raise InvalidAuthException
        else:
            raise RemoteErrorException(str(e))
def fetchHourly(request):
    conn = Connection(settings.SCRAPINGHUB_KEY)
    project = conn[65427]
    jobs = project.jobs(state='finished', has_tags='hourly', count=2)
    for job in jobs:
        saveItems(job.items())
    return HttpResponse('success')
def fetch_hourly():
    logger.info("Start task")
    conn = Connection(settings.SCRAPINGHUB_KEY)
    project = conn[65427]
    jobs = project.jobs(state='finished', has_tags='hourly', count=2)
    for job in jobs:
        saveItems(job.items())
def __init__(self, crawler):
    settings = crawler.settings
    self.hs_endpoint = settings.get("HS_ENDPOINT")
    self.hs_auth = self._get_config(settings, "HS_AUTH")
    self.hs_projectid = self._get_config(
        settings, "HS_PROJECTID", os.environ.get('SCRAPY_PROJECT_ID'))
    self.hs_frontier = self._get_config(settings, "HS_FRONTIER")
    self.hs_consume_from_slot = self._get_config(settings, "HS_CONSUME_FROM_SLOT")
    self.hs_number_of_slots = settings.getint(
        "HS_NUMBER_OF_SLOTS", DEFAULT_HS_NUMBER_OF_SLOTS)
    self.hs_max_links = settings.getint("HS_MAX_LINKS", DEFAULT_MAX_LINKS)
    self.hs_start_job_enabled = settings.getbool("HS_START_JOB_ENABLED", False)
    self.hs_start_job_on_reason = settings.getlist(
        "HS_START_JOB_ON_REASON", ['finished'])

    conn = Connection(self.hs_auth)
    self.panel_project = conn[self.hs_projectid]

    self.hsclient = HubstorageClient(auth=self.hs_auth, endpoint=self.hs_endpoint)
    self.project = self.hsclient.get_project(self.hs_projectid)
    self.fclient = self.project.frontier

    self.new_links = defaultdict(set)
    self.batch_ids = []

    crawler.signals.connect(self.close_spider, signals.spider_closed)

    # Make sure the logger for hubstorage.batchuploader is configured
    logging.basicConfig()
def get_last_24h_jobs(apikey, project_id):
    """Fetch jobs that finished in the last 24 hours."""
    project = Project(Connection(apikey), project_id)
    since_time = datetime.utcnow() - timedelta(hours=24)
    jobs = [
        job for job in project.jobs(state='finished')
        if is_job_newer_than(job, since_time)
    ]
    return jobs
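# Hedged sketch of the is_job_newer_than() helper used above, which the
# snippet references but does not define. It assumes job.info exposes an
# 'updated_time' field, either as epoch milliseconds or as an ISO-8601 string;
# treat the field name and format as assumptions, not the documented API.
from datetime import datetime

def is_job_newer_than(job, since_time):
    updated = job.info.get('updated_time')
    if isinstance(updated, (int, float)):
        # assume epoch milliseconds
        job_time = datetime.utcfromtimestamp(updated / 1000.0)
    elif isinstance(updated, str):
        # assume an ISO-8601 timestamp such as '2017-01-30T14:22:34'
        job_time = datetime.strptime(updated[:19], '%Y-%m-%dT%H:%M:%S')
    else:
        return False
    return job_time >= since_time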
def schedule_spider(project, endpoint, apikey, spider, arguments=(), settings=()):
    conn = Connection(apikey, url=endpoint)
    try:
        return conn[project].schedule(
            spider,
            job_settings=json.dumps(dict(x.split('=', 1) for x in settings)),
            **dict(x.split('=', 1) for x in arguments)
        )
    except APIError as e:
        raise RemoteErrorException(str(e))
def schedule_spider(project, endpoint, apikey, spider, arguments=(), settings=(),
                    priority=DEFAULT_PRIORITY, units=None, tag=()):
    conn = Connection(apikey, url=endpoint)
    try:
        kw = dict(x.split('=', 1) for x in arguments)
        if units is not None:
            kw['units'] = units
        return conn[project].schedule(
            spider,
            job_settings=json.dumps(dict(x.split('=', 1) for x in settings)),
            priority=priority,
            add_tag=tag,
            **kw
        )
    except APIError as e:
        raise RemoteErrorException(str(e))
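# Usage sketch for schedule_spider() above: arguments and settings are passed
# as 'key=value' strings and split on the first '='. The project id, endpoint,
# API key and spider name below are placeholders, not real values.
job_id = schedule_spider(
    12345, 'https://app.scrapinghub.com/api/', 'MY_APIKEY', 'myspider',
    arguments=('start_url=http://example.com',),
    settings=('DOWNLOAD_DELAY=1',),
    tag=('manual',),
)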
class ConnectionTest(unittest.TestCase):

    def setUp(self):
        self.client = Connection('http://server/api/', 'john', 'doe')

    def test_initialization(self):
        self.assertEqual(self.client.url, 'http://server/api/')
        self.assertEqual(str(self.client), 'Connection(http://server/api/)')
        self.assertEqual(self.client._request_headers['Authorization'],
                         'Basic am9objpkb2U=')

    def test_project_names(self):
        content = json.dumps(dict(status='ok', projects=['foo', 'bar']))
        with MockResponse(self.client, content) as mock:
            self.assertEqual(self.client.project_names(), ['foo', 'bar'])
            self.assertEqual(mock.url, 'http://server/api/scrapyd/listprojects.json')
            self.assertEqual(mock.data, None)
def __init__(self, crawler):
    self.crawler = crawler
    self.hs_endpoint = crawler.settings.get("HS_ENDPOINT")
    self.hs_auth = self._get_config(crawler, "HS_AUTH")
    self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
    self.hs_frontier = self._get_config(crawler, "HS_FRONTIER")
    self.hs_consume_from_slot = self._get_config(crawler, "HS_CONSUME_FROM_SLOT")
    try:
        self.hs_number_of_slots = int(
            crawler.settings.get("HS_NUMBER_OF_SLOTS", DEFAULT_HS_NUMBER_OF_SLOTS))
    except ValueError:
        self.hs_number_of_slots = DEFAULT_HS_NUMBER_OF_SLOTS
    try:
        self.hs_max_links = int(
            crawler.settings.get("HS_MAX_LINKS", DEFAULT_MAX_LINKS))
    except ValueError:
        self.hs_max_links = DEFAULT_MAX_LINKS
    self.hs_start_job_enabled = crawler.settings.get("HS_START_JOB_ENABLED", False)
    self.hs_start_job_on_reason = crawler.settings.get(
        "HS_START_JOB_ON_REASON", ['finished'])
    self.hs_start_job_new_panel = crawler.settings.get(
        "HS_START_JOB_NEW_PANEL", False)

    if not self.hs_start_job_new_panel:
        conn = Connection(self.hs_auth)
        self.oldpanel_project = conn[self.hs_projectid]

    self.hsclient = HubstorageClient(auth=self.hs_auth, endpoint=self.hs_endpoint)
    self.project = self.hsclient.get_project(self.hs_projectid)
    self.fclient = self.project.frontier

    self.new_links_count = defaultdict(int)
    self.batch_ids = []

    crawler.signals.connect(self.close_spider, signals.spider_closed)

    # Make sure the logger for hubstorage.batchuploader is configured
    logging.basicConfig()
def handle(self, *args, **kwargs):
    conn = Connection('1a22b051feb8448fa71bb1cc2ea4aa9c')
    project = conn[62659]
    job_id = kwargs['job_id']
    feed_type = kwargs['type']
    processor_maps = {
        'nsw': process_nsw,
        'act': process_act,
        'qld': process_qld,
        'sa': process_sa,
        'vic': process_vic,
        'nt': process_nt,
        'wa': process_wa,
    }
    # use .get() so an unknown type raises the explicit error below
    # instead of a KeyError
    processor = processor_maps.get(feed_type)
    if not processor:
        raise Exception('Unknown type {0}'.format(feed_type))
    for item in project.job(job_id).items():
        fields = processor(item)
        JusticeOfPeace.objects.create(**fields)
#!/usr/bin/python
# -*- coding: utf-8 -*-
import MySQLdb
import sys
from geopy.exc import GeocoderTimedOut

sys.path.insert(0, r'C:\Users\home\greekbeachesapp')
from sunthesis import elegxos_paralias
from scrapinghub import Connection
from geopy.geocoders import Nominatim
import time

conn = Connection('42e697f0d35348e9b6c70d6e74dcbc93')
print(conn)
print(conn.project_ids())
project = conn[78127]
jobs = project.jobs(state='finished')
jobs_id = [x.id for x in jobs]
print(jobs_id)
job = project.job(u'78127/1/1')
source = 'thassos-view.com'

servername = "localhost"
username = "******"
password = ""
dbname = "database"
db = MySQLdb.connect(servername, username, password, dbname,
                     charset="utf8", use_unicode=True)
cursor = db.cursor()

for item in job.items():
#!/usr/bin/python
# -*- coding: utf-8 -*-
import MySQLdb
import sys
from geopy.exc import GeocoderTimedOut

sys.path.insert(0, r'C:\Users\home\greekbeachesapp')
from sunthesis import elegxos_paralias
from scrapinghub import Connection
from geopy.geocoders import Nominatim
import time

conn = Connection('57c54e0d05fb44439a9fc0887fb6ab3e')
print(conn)
print(conn.project_ids())
project = conn[77608]
jobs = project.jobs(state='finished')
jobs_id = [x.id for x in jobs]
print(jobs_id)
job = project.job(u'77608/1/1')
source = 'e-zakynthos.com'

servername = "localhost"
username = "******"
password = ""
dbname = "database"
db = MySQLdb.connect(servername, username, password, dbname,
                     charset="utf8", use_unicode=True)
cursor = db.cursor()

z = u'\u0396\u03ac\u03ba\u03c5\u03bd\u03b8\u03bf\u03c2'  # 'Zakynthos' in Greek
def main(args):
    project = Connection(args.apikey)[args.project_id]
    items = fetch_latest_job_items(project)
    for it in rank_items(items, args.top):
        print(it)
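# Hedged sketches of the two helpers main() relies on but does not define.
# fetch_latest_job_items() reuses the count= filter shown in the snippets
# above; rank_items() and its 'score' field are purely illustrative assumptions.
def fetch_latest_job_items(project):
    # take the most recently finished job and return its items as a list
    for job in project.jobs(state='finished', count=1):
        return list(job.items())
    return []

def rank_items(items, top):
    # sort by a hypothetical 'score' field and keep the top N items
    return sorted(items, key=lambda it: it.get('score', 0), reverse=True)[:top]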
def test_connection_init_with_default_url():
    conn = Connection(apikey='testkey')
    assert conn.url == Connection.DEFAULT_ENDPOINT
def test_connection_init_assert_apikey_not_url():
    with pytest.raises(AssertionError):
        Connection(password='******', apikey='http://some-url')
def test_connection_init_use_key_from_env():
    conn = Connection()
    assert conn.apikey == 'testkey'
def test_connection_init_fail_wo_apikey(monkeypatch):
    monkeypatch.delenv('SH_APIKEY', raising=False)
    with pytest.raises(RuntimeError):
        Connection()
def setUp(self):
    self.client = Connection('http://server/api/', 'john', 'doe')
def test_connection_init_with_custom_timeout():
    conn = Connection(apikey='testkey', connection_timeout=60)
    assert conn._connection_timeout == 60
def test_connection_init_with_default_timeout():
    conn = Connection(apikey='testkey')
    assert conn._connection_timeout is None
def connection():
    return Connection(apikey='testkey', url='http://test-url')
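# Example test using the connection() factory above, assuming it is registered
# as a pytest fixture (e.g. decorated with @pytest.fixture in the original
# module). It only checks values the factory itself sets.
def test_connection_fixture_values(connection):
    assert connection.url == 'http://test-url'
    assert connection.apikey == 'testkey'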