def schedule_spider(project, endpoint, apikey, spider, arguments=(), settings=(),
                    priority=DEFAULT_PRIORITY, units=None, tag=(), environment=()):
    client = ScrapinghubClient(apikey, dash_endpoint=endpoint)
    try:
        project = client.get_project(project)
        args = dict(x.split('=', 1) for x in arguments)
        cmd_args = args.pop('cmd_args', None)
        meta = args.pop('meta', None)
        job = project.jobs.run(
            spider=spider,
            meta=json.loads(meta) if meta else {},
            cmd_args=cmd_args,
            job_args=args,
            job_settings=dict(x.split('=', 1) for x in settings),
            priority=priority,
            units=units,
            add_tag=tag,
            environment=dict(x.split('=', 1) for x in environment),
        )
        return job.key
    except ScrapinghubAPIError as e:
        raise RemoteErrorException(str(e))

def get_last_job_ids(self):
    project_id = os.environ.get("SCRAPY_PROJECT_ID")
    api_key = self.spider.settings.get("SCRAPINGHUB_API_KEY")
    if not project_id or not api_key:
        return []
    client = ScrapinghubClient(api_key)
    project = client.get_project(project_id)
    jobs = project.jobs.list()
    if not jobs:
        return []
    # Find the last job run for this spider's searchterm; the same spider
    # can be invoked with different searchterms.
    last_matching_job = None
    for each in jobs:
        key = each["key"]
        job = client.get_job(key)
        metadata = dict(job.metadata.list())
        searchterm = metadata.get("spider_args", {}).get("searchterm", "")
        if self.spider.searchterm == searchterm:
            last_matching_job = job
            break
    if not last_matching_job:
        return []
    return [item["id"] for item in last_matching_job.items.iter()]

def __init__(self, collection_name, project_id=None, apikey=None,
             autodetect_partitions=True):
    """
    collection_name - target collection
    project_id - target project id
    apikey - hubstorage apikey with access to the given project. If None,
        delegate to the scrapinghub lib.
    autodetect_partitions - if True (the default), autodetect a partitioned
        collection. Use False to force reading a non-partitioned collection
        when a partitioned version also exists under the same name.
    """
    self.hsc = ScrapinghubClient(apikey)._hsclient
    project_id = project_id or get_project_id()
    self.hsp = self.hsc.get_project(project_id)
    num_partitions = None
    if autodetect_partitions:
        num_partitions = get_num_partitions(self.hsp, collection_name)
        if num_partitions:
            log.info("Partitioned collection detected: %d total partitions.",
                     num_partitions)
    self.collections = []
    if num_partitions:
        for p in range(num_partitions):
            self.collections.append(
                self.hsp.collections.new_store("{}_{}".format(collection_name, p)))
    else:
        self.collections.append(self.hsp.collections.new_store(collection_name))

def getDataXoso():
    # Enter ScrapingHub
    apikey = '40f9881d52794d7bb09b9f5ee6d12a3e'  # your API key as a string
    client = ScrapinghubClient(apikey)
    projectID = 410647
    project = client.get_project(projectID)
    # get spider
    spiderID = 'quotes'
    spider = project.spiders.get(spiderID)
    jobs_summary = spider.jobs.iter()
    job_keys = [j['key'] for j in jobs_summary]
    print(job_keys)
    result = []
    for job_key in job_keys:
        job = project.jobs.get(job_key)
        # Check whether the job completed successfully
        if job.metadata.get(u'close_reason') == u'finished':
            for item in job.items.iter():
                result.append(item)
    return result

def create_json_schema(source_key: str, item_numbers: List[int] = None) -> dict:
    client = ScrapinghubClient()
    if helpers.is_collection_key(source_key):
        store = api.get_collection(source_key)
        items_count = store.count()
    elif helpers.is_job_key(source_key):
        job = client.get_job(source_key)
        items_count = api.get_items_count(job)
        store = job.items
    else:
        logger.error(f"{source_key} is not a job or collection key")
        return
    if items_count == 0:
        logger.error(f"{source_key} does not have any items")
        return
    item_n_err = "{} is a bad item number, choose numbers between 0 and {}"
    if item_numbers:
        item_numbers.sort()
        if item_numbers[-1] >= items_count or item_numbers[0] < 0:
            logger.error(item_n_err.format(item_numbers[-1], items_count - 1))
            return
    else:
        item_numbers = set_item_no(items_count)
    samples = []
    for n in item_numbers:
        items = api.get_items(source_key, start_index=n, count=1)
        samples.append(items[0])
    return infer_schema(samples)

def __init__(self):
    self.workflow_loop_enabled = False
    self.args = self.parse_args()
    self.client = ScrapinghubClient(self.args.apikey)
    self.project_id = resolve_project_id(self.args.project_id or self.project_id)
    if not self.project_id:
        self.argparser.error('Project id not provided.')

def ready(self):
    global test
    apikey = '88133cc793ab4296b56db8a87eaae1ec'
    client = ScrapinghubClient(apikey)
    test = client.get_job('223795/1/3')
    test = sorted(test.items.list(), key=lambda k: k['score'], reverse=True)

def __init__(self, auth, project_id, frontier, batch_size=0):
    self._client = ScrapinghubClient(auth=auth)
    self._hcf = self._client.get_project(project_id).frontiers
    self._frontier = self._hcf.get(frontier)
    self._links_count = defaultdict(int)
    self._links_to_flush_count = defaultdict(int)
    self._batch_size = batch_size
    self._hcf_retries = 10

def __init__(self):
    self.apikey = ''  # your API key as a string
    self.client = ScrapinghubClient(self.apikey)
    self.project_num = 0
    self.project = self.client.get_project(self.project_num)
    self.neighborhood_spider = self.get_neighborhood_spider()
    self.listing_spider = self.get_listing_spider()
    self.airdna_spider = self.get_airdna_spider()

def __init__(self, input_uri, settings):
    super().__init__(settings)
    client = ScrapinghubClient()
    jobkey = parse_job_key(os.environ['SHUB_JOBKEY'])
    project = client.get_project(jobkey.project_id)
    collection_name = input_uri.replace('collections://', '')
    self._store = project.collections.get_store(collection_name)

def __init__(self, crawl_url):
    """ Initialize and build a connection with Scrapinghub via its API """
    self._client = ScrapinghubClient(settings.SCRAPINGHUB_APIKEY)
    # TODO: needs to be revised
    self._project_id = self._client.projects.list()[0]
    self._project = self._client.get_project(self._project_id)
    self._target = crawl_url
    self._job = None
    self._meta = None
    self._state = 'initialized'

def jobRuning00():
    # Enter ScrapingHub
    apikey = '40f9881d52794d7bb09b9f5ee6d12a3e'  # your API key as a string
    client = ScrapinghubClient(apikey)
    projectID = 410647
    project = client.get_project(projectID)
    # get spider
    spiderID = 'quotes'
    spider = project.spiders.get(spiderID)
    spider.jobs.run()

def showBooks(request):
    global job
    job = test
    if job is None:
        print("adgaegae")
        apikey = '88133cc793ab4296b56db8a87eaae1ec'
        client = ScrapinghubClient(apikey)
        job = client.get_job('223795/1/3')
        job = sorted(job.items.list(), key=lambda k: k['score'], reverse=True)
        return render(request, 'user_page.html',
                      {'spider_books': job,
                       'user_fullname': request.user.get_full_name,
                       'myuser_id': request.user.myuser.id})
    else:
        '''job = sorted(job.items.list(), key=lambda k: k['score'], reverse=True)'''
        return render(request, 'user_page.html',
                      {'spider_books': job,
                       'user_fullname': request.user.get_full_name,
                       'myuser_id': request.user.myuser.id})

def __init__(self, crawler):
    settings = crawler.settings
    coll_name = settings.get('TARGET_COLLECTION_NAME')
    coll_type = settings.get('TARGET_COLLECTION_TYPE', 's')
    if not coll_name or not coll_type:
        raise NotConfigured('Please set target collection settings.')
    current_project_id = os.environ.get('SCRAPY_PROJECT_ID')
    project_id = settings.get('HCF_PROJECT_ID', current_project_id)
    self.logger = logging.getLogger(__name__)
    # if auth is not set explicitly, fall back to the SH job-level token
    self.client = ScrapinghubClient(settings.get('HCF_AUTH'))
    self.project = self.client.get_project(project_id)
    self.collection = self.project.collections.get(coll_type, coll_name)

def main():
    args = parse_args()
    apikey = os.environ.get('SH_APIKEY') or args.apikey
    if not apikey:
        print('Please set API key')
        exit(1)
    client = ScrapinghubClient(apikey)
    job = client.get_job(args.job)
    events = args.func(job)
    if args.command == 'errors':
        report_errors = create_errors_report(events,
                                             max_urls_for_output=min(args.max, 30))
        print(report_errors)

def menu():
    client = ScrapinghubClient(config['scrapinghub']['api_key'])
    project = client.get_project(config['scrapinghub']['project_id'])
    job = project.jobs.list(spider=config['scrapinghub']['spider_name'],
                            state='finished', count=1)[0]
    job = client.get_job(job['key'])

    menu = {}
    menu['aktualnosc'] = job.metadata.get('finished_time')
    menu['restauracja'] = {
        "nazwa": "CamelPizza",
        "logo": "https://www.camelpizza.pl/system/logos/27323/menu_size/1549450693.png",
        "url": "http://camelpizza.pl"
    }
    menu['grupy'] = []

    def get_grupa(item):
        for grupa in menu['grupy']:
            if grupa['nazwa'] == item['grupa']:
                return grupa
        grupa = {
            'nazwa': item['grupa'],
            'pozycje': []
        }
        menu['grupy'].append(grupa)
        return grupa

    def get_pozycja(item):
        grupa = get_grupa(item)
        for pozycja in grupa['pozycje']:
            if pozycja['nazwa'] == item['pozycja']:
                return pozycja
        pozycja = {
            'nazwa': item['pozycja'],
            'opis': item['opis'],
            'warianty': []
        }
        grupa['pozycje'].append(pozycja)
        return pozycja

    def get_cena(item):
        kwota, waluta = item['cena'].replace(u'zł', u' zł').split()
        kwota = float(kwota.replace(',', '.'))
        waluta = waluta.replace(u'zł', 'PLN')
        return {
            'kwota': kwota,
            'waluta': waluta
        }

    items = job.items.list()
    for item in items:
        try:
            pozycja = get_pozycja(item)
            wariant = {
                'opis': item['wariant'],
                'ceny': [get_cena(item)]
            }
            pozycja['warianty'].append(wariant)
        except Exception:
            print("Invalid item")
    return jsonify(menu)

class PttCrawlerJob():

    def __init__(self, crawl_url):
        """ Initialize and build a connection with Scrapinghub via its API """
        self._client = ScrapinghubClient(settings.SCRAPINGHUB_APIKEY)
        # TODO: needs to be revised
        self._project_id = self._client.projects.list()[0]
        self._project = self._client.get_project(self._project_id)
        self._target = crawl_url
        self._job = None
        self._meta = None
        self._state = 'initialized'

    def run(self):
        """ Run the crawler (spider) """
        if not self._job:
            self._job = self._project.jobs.run(
                'ptt', job_args={'test_url': self._target})
            return self._job.key
        else:
            return None

    def update_meta(self):
        """ Update the job's metadata """
        if self._job:
            self._meta = dict(self._job.metadata.iter())
            self._state = self._meta['state']

    def cancle(self):
        """ Cancel the job """
        self._job.cancel()

    @property
    def meta(self):
        """ Get the job's metadata """
        if self._meta:
            return self._meta
        else:
            return None

    @property
    def state(self):
        """ Get the job's current state """
        return self._state

    @property
    def item(self):
        """ Get scraped items """
        if self._state == 'finished':
            # items.iter() returns an iterable that is not a list and does not
            # support indexing, so it has to be turned into a list. Each element
            # of the list is a dict.
            return list(self._job.items.iter())
        else:
            return None

def job(self) -> Job:
    if not self._job:
        job = ScrapinghubClient().get_job(self.key)
        if job.metadata.get("state") == "deleted":
            raise ValueError(f"{self.key} has 'deleted' state")
        self._job = job
    return self._job

class ScrapyCloudClient:

    def __init__(self):
        self.apikey = ''  # your API key as a string
        self.client = ScrapinghubClient(self.apikey)
        self.project_num = 0
        self.project = self.client.get_project(self.project_num)
        self.neighborhood_spider = self.get_neighborhood_spider()
        self.listing_spider = self.get_listing_spider()
        self.airdna_spider = self.get_airdna_spider()

    def get_neighborhood_spider(self):
        return ScrapyCloudNeighborhoodSearchSpider(
            self.project.spiders.get('neighborhood_search'))

    def get_listing_spider(self):
        return ScrapyCloudSpider(self.project.spiders.get('listing'))

    def get_airdna_spider(self):
        return ScrapyCloudSpider(self.project.spiders.get('airdna'))

    def listing_ids(self):
        all_ids = self.neighborhood_spider.get_listing_ids()
        print(len(all_ids))
        id_string = ""
        for num, i in enumerate(all_ids):
            if num == 0:
                id_string = str(i)
            else:
                id_string = id_string + "," + str(i)
        return id_string

def shub_conn():
    # Don't rely on the default `.get()` fallback alone, because that would
    # evaluate `settings.SH_API_KEY` regardless, and you may not have it
    # set up locally.
    api_key = os.environ.get('SH_API_KEY') or Settings().get('SH_API_KEY')
    # NOTE: not really safe when the setting name doesn't exist
    return ScrapinghubClient(api_key)

class HcfCrawlerPipeline(object):

    def __init__(self, crawler):
        settings = crawler.settings
        coll_name = settings.get('TARGET_COLLECTION_NAME')
        coll_type = settings.get('TARGET_COLLECTION_TYPE', 's')
        if not coll_name or not coll_type:
            raise NotConfigured('Please set target collection settings.')
        current_project_id = os.environ.get('SCRAPY_PROJECT_ID')
        project_id = settings.get('HCF_PROJECT_ID', current_project_id)
        self.logger = logging.getLogger(__name__)
        # if auth is not set explicitly, fall back to the SH job-level token
        self.client = ScrapinghubClient(settings.get('HCF_AUTH'))
        self.project = self.client.get_project(project_id)
        self.collection = self.project.collections.get(coll_type, coll_name)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_item(self, item, spider):
        item_to_export = dict(item)
        if '_key' not in item_to_export:
            fp = hashlib.sha1()
            fp.update(canonicalize_url(item['url']).encode('utf8'))
            item_to_export['_key'] = fp.hexdigest()
        self.collection.set(item_to_export)
        return item

def start_requests(self):
    #self.c.execute('DROP TABLE IF EXISTS iteminfos')
    #self.c.execute('CREATE TABLE IF NOT EXISTS iteminfos (item_id, item_main_type, item_mid_type, item_sub_type, item_price, area_id)')
    #self.conn.commit()
    #temp_conn = sql.connect('dataset/area.db')
    #temp_c = temp_conn.cursor()
    #temp_c.execute('DELETE FROM areainfos WHERE rowid NOT IN (SELECT min(rowid) FROM areainfos GROUP BY item_id,item_url,area_id)')
    #item_list = [row for row in temp_c.execute('SELECT * FROM areainfos ORDER BY area_id')]
    #temp_conn.commit()
    client = ScrapinghubClient('ec16b94bcf024d0bb502684368658d59')
    myproject = client.projects.get('254951')
    mystore = myproject.collections.get_store('area_info')
    value_num = mystore.count()
    #for item in item_list:
    for item in range(value_num):
        #url = item[1].encode()
        log_item = mystore.get(str(item))
        area_id = log_item['value']['area_id'][0]
        item_ids = log_item['value']['item_id']
        item_urls = log_item['value']['item_url']
        for i in range(len(item_ids)):
            if i % 10 == 0:
                sleep(0.8)
            #request = scrapy.Request(url=url,callback=self.parse)
            #request.meta['item_id'] = item[0]
            #request.meta['area_id'] = item[2]
            request = scrapy.Request(url=item_urls[i], callback=self.parse,
                                     errback=self.error_handler)
            request.meta['item_id'] = item_ids[i]
            request.meta['area_id'] = area_id
            yield request

def create_json_schema(source_key: str,
                       item_numbers: Optional[List[int]] = None) -> dict:
    if helpers.is_collection_key(source_key):
        store = api.get_collection(source_key)
        items_count = store.count()
    elif helpers.is_job_key(source_key):
        job = ScrapinghubClient().get_job(source_key)
        items_count = api.get_items_count(job)
    else:
        raise ValueError(
            f"'{source_key}' is not a valid job or collection key")

    if items_count == 0:
        raise ValueError(f"'{source_key}' does not have any items")

    item_n_err = "{} is a bad item number, choose numbers between 0 and {}"
    if item_numbers:
        item_numbers.sort()
        if item_numbers[-1] >= items_count or item_numbers[0] < 0:
            raise ValueError(
                item_n_err.format(item_numbers[-1], items_count - 1))
    else:
        item_numbers = set_item_no(items_count)

    samples = []
    for n in item_numbers:
        items = api.get_items(source_key, start_index=n, count=1, p_bar=None)
        samples.append(items[0])
    return infer_schema(samples)

def __init__(self, crawler):
    self.crawler = crawler
    settings = crawler.settings
    current_project_id = os.environ.get('SCRAPY_PROJECT_ID')
    project_id = settings.get('HCF_PROJECT_ID', current_project_id)
    frontier_name = settings.get('HCF_FRONTIER')
    frontier_slot = settings.get('HCF_FRONTIER_SLOT')
    if not project_id or not frontier_name or not frontier_slot:
        raise NotConfigured('Please set HCF settings for the middleware.')
    self.batch_size = settings.getint('HCF_BATCH_SIZE', DEFAULT_BATCH_SIZE)
    self.logger = logging.getLogger(__name__)
    # if auth is not set explicitly, fall back to the SH job-level token
    self.client = ScrapinghubClient(settings.get('HCF_AUTH'))
    self.project = self.client.get_project(project_id)
    self.frontier = self.project.frontiers.get(frontier_name)
    self.frontier_slot = self.frontier.get(frontier_slot)

def test_projects_list(client):
    projects = client.projects.list()
    assert client.projects.list() == []
    # use the user apikey to list test projects
    client = ScrapinghubClient(TEST_USER_AUTH, TEST_DASH_ENDPOINT)
    projects = client.projects.list()
    assert isinstance(projects, list)
    assert int(TEST_PROJECT_ID) in projects

def index():
    apikey = os.environ.get("APIKEY")
    job_id = os.environ.get("JOB_ID")
    client = ScrapinghubClient(apikey)
    job = client.get_job(job_id)
    data = []
    for item in job.items.iter():
        # use a named dict instead of shadowing the builtin `dict`
        entry = {
            'title': item['title'][0],
            'director': item['director'][0],
            'summary': item['summary'][0]
        }
        data.append(entry)
    return render_template('index.html', data=data)

def __init__(self):
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('pid', help='Target project id')
    parser.add_argument('spider', help='Spider name')
    parser.add_argument('frontier', help='Frontier name')
    parser.add_argument('prefix', help='Slot prefix')
    parser.add_argument('--max-jobs', type=int, default=1,
                        help='Max number of jobs for the given spider allowed to '
                             'run in parallel. Default is %(default)s.')
    parser.add_argument('--apikey',
                        help='API key to use for HCF access. Uses the SH_APIKEY '
                             'environment variable if not given.')
    parser.add_argument('--spider-args', default='{}',
                        help='Spider arguments dict in JSON format')
    parser.add_argument('--loop-mode', type=int, metavar='SECONDS',
                        help='If provided, the manager will run in loop mode, '
                             'with a cycle every given number of seconds.')
    self.args = parser.parse_args()
    client = ScrapinghubClient(self.args.apikey)
    self.project = client.get_project(self.args.pid)
    self.hcfpal = HCFPal(client._hsclient.get_project(self.args.pid))

def obtainLatestJobIDofSpider(apikey, project_id, spider_id):
    client = ScrapinghubClient(apikey)
    myproject = client.projects.get(project_id)
    job_keys = [_['key'] for _ in myproject.jobs.iter()]
    job_ids = [
        int(_.split('/')[2])
        if (_.split('/')[0] == myproject.key and _.split('/')[1] == str(spider_id))
        else ''
        for _ in job_keys
    ]
    job_ids = [_ if type(_) is int else 0 for _ in job_ids]
    return int(sorted(job_ids)[-1])

def has_project_access(project, endpoint, apikey):
    """Check whether an API key has access to a given project.

    May raise InvalidAuthException if the API key is invalid (but not if it
    is valid but lacks access to the project)."""
    client = ScrapinghubClient(apikey, dash_endpoint=endpoint)
    try:
        return project in client.projects.list()
    except ScrapinghubAPIError as e:
        if 'Authentication failed' in str(e):
            raise InvalidAuthException
        else:
            raise RemoteErrorException(str(e))

def restore(spider_id, job_id=0, store_name='', *keys):
    APIKEY = 'ec16b94bcf024d0bb502684368658d59'
    PROJECTID = '254951'
    SPIDERID = spider_id
    client = ScrapinghubClient(APIKEY)
    myproject = client.get_project(PROJECTID)
    if job_id == 0:
        myjob_id = obtainLatestJobIDofSpider(APIKEY, PROJECTID, SPIDERID)
        #job_keys = [_['key'] for _ in myproject.jobs.iter()]
        #job_ids = [int(_.split('/')[2]) if (_.split('/')[0]==myproject.key and _.split('/')[1]==SPIDERID) else '' for _ in job_keys]
        #myjob_id = sorted(job_ids)[-1]
    else:
        myjob_id = job_id
    myjob = myproject.jobs.get('%s/%s/%d' % (PROJECTID, SPIDERID, myjob_id))
    myitem = [_ for _ in myjob.items.iter()]
    item_num = len(myitem)
    item_container = dict()
    for key_i in keys:
        item_container[key_i] = [_[key_i] for _ in myitem]
    #area_ids = [_['area_id'] for _ in myitem]
    #item_ids = [_['item_id'] for _ in myitem]
    #item_urls = [_['item_url'] for _ in myitem]
    store_names = [_['name'] for _ in myproject.collections.iter()]
    if store_name in store_names:
        mycollection = myproject.collections.get_store(store_name)
        if mycollection.count() > 0:
            for _ in mycollection.iter():
                mycollection.delete(_['_key'])
        for item_i in range(item_num):
            area_info_item = dict()
            for key_i in keys:
                area_info_item[key_i] = item_container[key_i][item_i]
            #area_info_item['area_id'] = area_ids[item_i]
            #area_info_item['item_id'] = item_ids[item_i]
            #area_info_item['item_url'] = item_urls[item_i]
            mycollection.set({'_key': str(item_i), 'value': area_info_item})
    else:
        print("the collection %s you want to access does not exist." % store_name)