Example #1
def get_job(job):
    jobid, apikey = get_job_specs(job)
    hsc = HubstorageClient(auth=apikey)
    job = hsc.get_job(jobid)
    if not job.metadata:
        raise NotFoundException('Job {} does not exist'.format(jobid))
    return job
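A minimal usage sketch for the helper above (not part of the original example; the "project/spider/job" job-spec string and the behaviour of get_job_specs are assumptions here, the helper only requires that it returns a (jobid, apikey) pair):

# hypothetical call of the get_job() helper; the job-spec format is an assumption
job = get_job('123/45/678')
print(job.metadata.get('state'))   # e.g. 'finished'
for item in job.items.list():      # items.list() as used in the other examples
    print(item)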
Example #2
def hsclient_with_retries(max_retries=3, max_retry_time=1):
    return HubstorageClient(
        auth=TEST_AUTH,
        endpoint=TEST_ENDPOINT,
        max_retries=max_retries,
        max_retry_time=max_retry_time,
    )
Example #3
    def __init__(self, crawler):
        settings = crawler.settings
        self.hs_endpoint = settings.get("HS_ENDPOINT")
        self.hs_auth = self._get_config(settings, "HS_AUTH")
        self.hs_projectid = self._get_config(settings, "HS_PROJECTID", os.environ.get('SCRAPY_PROJECT_ID'))
        self.hs_frontier = self._get_config(settings, "HS_FRONTIER")
        self.hs_consume_from_slot = self._get_config(settings, "HS_CONSUME_FROM_SLOT")
        self.hs_number_of_slots = settings.getint("HS_NUMBER_OF_SLOTS", DEFAULT_HS_NUMBER_OF_SLOTS)
        self.hs_max_links = settings.getint("HS_MAX_LINKS", DEFAULT_MAX_LINKS)
        self.hs_start_job_enabled = settings.getbool("HS_START_JOB_ENABLED", False)
        self.hs_start_job_on_reason = settings.getlist("HS_START_JOB_ON_REASON", ['finished'])

        conn = Connection(self.hs_auth)
        self.panel_project = conn[self.hs_projectid]

        self.hsclient = HubstorageClient(auth=self.hs_auth, endpoint=self.hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links = defaultdict(set)
        self.batch_ids = []

        crawler.signals.connect(self.close_spider, signals.spider_closed)

        # Make sure the logger for hubstorage.batchuploader is configured
        logging.basicConfig()
        self.logger = logging.getLogger("HCF")
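For context, a minimal sketch of the close_spider handler that the constructor above connects to spider_closed (not part of the original example; the exact clean-up steps are assumptions based on the python-hubstorage API):

    def close_spider(self, spider, reason):
        # assumed clean-up: flush frontier batches that are still buffered
        # client-side, then close the Hubstorage client and its batch uploader
        self.fclient.flush()
        self.hsclient.close()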
Example #4
def _run_scraper(jobkey, jobauth, close_reason=None):
    httpmethods = 'GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT'.split()
    # Scraper - uses job level auth, no global or project auth available
    client = HubstorageClient(endpoint=TEST_ENDPOINT)
    # use some fixed timestamp to represent current time
    now_ts = 1476803148638
    with closing(client) as scraperclient:
        job = scraperclient.get_job(jobkey, auth=jobauth)
        for idx in range(MAGICN):
            iid = job.items.write({'uuid': idx})
            job.logs.debug('log debug %s' % idx, idx=idx)
            job.logs.info('log info %s' % idx, idx=idx)
            job.logs.warn('log warn %s' % idx, idx=idx)
            job.logs.error('log error %s' % idx, idx=idx)
            sid = job.samples.write([idx, idx, idx])
            rid = job.requests.add(
                url='http://test.com/%d' % idx,
                status=random.randint(100, 1000),
                method=random.choice(httpmethods),
                rs=random.randint(0, 100000),
                duration=random.randint(0, 1000),
                parent=random.randrange(0, idx + 1) if idx > 10 else None,
                ts=now_ts + 100 + idx,
            )
            assert iid == idx
            assert sid == idx
            assert rid == idx

        if isinstance(close_reason, Exception):
            raise close_reason

        if close_reason:
            job.metadata['close_reason'] = close_reason

        job.metadata.save()
Example #5
def get_scraped_data(dir, project_id, scrapinghub_key, spider):
    # establish a connection with Scrapinghub and get an items generator
    hc = HubstorageClient(auth=scrapinghub_key)
    print(project_id)
    empty, totalItems, keptItems = 0, 0, 0
    for job in hc.get_project(project_id).jobq.list(spider=spider):
        for item in hc.get_job(job['key']).items.list():
            print(item)
            totalItems += 1
            item = pd.Series(item)
            if item['title'] != '' and item['article'] != '' and \
                    item['title'] != ' ' and item['article'] != ' ':
                item['spider'] = spider
                item = item.drop('category')
                item = item.replace(["page1", "page2", "page3", "scrape_time", "", "basic"],
                                    [np.nan, np.nan, np.nan, np.nan, np.nan, "reutersbasic"])
                item = item.replace({r'<.*?>': '', r'\[.*?\]': '', r'\(.*?\)': ''}, regex=True)

                # add the article hash code as the id of the article
                item['id'] = hash(item['article'])

                # write the item (as records) to a JSON file
                file = dir + str(item['id']) + '.json'
                item.to_json(file)

                keptItems += 1

            else:
                empty += 1

    print('#' * 50)
    print('Fetched:', totalItems, 'from spider:', spider)
    print(keptItems, 'were written to the folder')
    print('-' * 50, '\n\n')
Example #6
 def open_spider(self, spider):
     client = HubstorageClient(auth=settings.SHUB_KEY)
     project = client.get_project(settings.SHUB_PROJ_ID)
     self.data_stores = {}
     for product_name in get_product_names():
         self.data_stores[product_name] = project.collections.new_store(
             product_name)
Example #7
 def client(self):
     from scrapinghub import HubstorageClient
     if self._client is None:
         user_agent = os.environ.get('SHUB_HS_USER_AGENT')
         self._client = HubstorageClient(endpoint=self.endpoint,
                                         auth=self.auth,
                                         user_agent=user_agent)
     return self._client
Example #8
class DiscoveryProcessorMixin(object):

    def get_previous_job(self, attr):
        if not hasattr(self, attr):
            raise AttributeError(
                'You should specify a {attr} argument to the job'.format(
                    attr=attr
                )
            )

        job_id = getattr(self, attr)
        auth = self.crawler.settings.get('SCRAPINGHUB_APIKEY')
        hc = HubstorageClient(auth=auth)
        return hc.get_job(job_id)
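A hypothetical spider using the mixin above (not from the original source; the spider class, its previous_job argument, and the follow-up requests are assumptions):

import scrapy

class DiscoverySpider(DiscoveryProcessorMixin, scrapy.Spider):
    name = 'discovery'

    def start_requests(self):
        # 'previous_job' would be passed as a spider argument, e.g. -a previous_job=123/45/678
        job = self.get_previous_job('previous_job')
        for item in job.items.list():
            yield scrapy.Request(item['url'], callback=self.parse)

    def parse(self, response):
        # hypothetical: re-visit pages discovered in the previous job
        yield {'url': response.url, 'status': response.status}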
Example #9
def _run_runner(hsproject, pushed, close_reason):
    client = HubstorageClient(endpoint=TEST_ENDPOINT, auth=TEST_AUTH)
    with closing(client) as runnerclient:
        job = start_job(hsproject)
        assert not job.metadata.get('stop_requested')
        job.metadata.update(host='localhost', slot=1)
        assert job.metadata.get('state') == 'running'
        # run scraper
        try:
            _run_scraper(job.key, job.jobauth, close_reason=close_reason)
        except Exception as exc:
            job.logs.error(message=str(exc), appendmode=True)
            job.close_writers()
            job.jobq.finish(job, close_reason='failed')
            # logging from runner must append and never remove messages logged
            # by scraper
            assert job.logs.batch_append
        else:
            job.jobq.finish(job, close_reason=close_reason or 'no_reason')
Example #10
def test_auth(hsclient, json_and_msgpack):
    # client without global auth set
    hsc = HubstorageClient(endpoint=hsclient.endpoint,
                           use_msgpack=hsclient.use_msgpack)
    assert hsc.auth is None

    # check no-auth access
    try:
        hsc.push_job(TEST_PROJECT_ID, TEST_SPIDER_NAME)
    except HTTPError as exc:
        assert exc.response.status_code == 401
    else:
        raise AssertionError('401 not raised')

    try:
        hsc.get_project(TEST_PROJECT_ID).push_job(TEST_SPIDER_NAME)
    except HTTPError as exc:
        assert exc.response.status_code == 401
    else:
        raise AssertionError('401 not raised')

    try:
        hsc.get_job((TEST_PROJECT_ID, 1, 1)).items.list()
    except HTTPError as exc:
        assert exc.response.status_code == 401
    else:
        raise AssertionError('401 not raised')

    try:
        hsc.get_project(TEST_PROJECT_ID).get_job(
            (TEST_PROJECT_ID, 1, 1)).items.list()
    except HTTPError as exc:
        assert exc.response.status_code == 401
    else:
        raise AssertionError('401 not raised')

    # create project with auth
    auth = hsclient.auth
    project = hsc.get_project(TEST_PROJECT_ID, auth)
    assert project.auth == auth
    job = project.push_job(TEST_SPIDER_NAME)
    samejob = project.get_job(job.key)
    assert samejob.key == job.key
Example #11
def test_custom_ua():
    client = HubstorageClient(auth=TEST_AUTH,
                              endpoint=TEST_ENDPOINT,
                              user_agent='testUA')
    assert client.user_agent == 'testUA'
Example #12
 def setUpClass(cls):
     cls.endpoint = HS_ENDPOINT
     cls.auth = HS_AUTH
     cls.hsclient = HubstorageClient(auth=cls.auth, endpoint=cls.endpoint)
     cls.project = cls.hsclient.get_project(cls.projectid)
     cls.fclient = cls.project.frontier
Example #13
def hsclient():
    return HubstorageClient(auth=TEST_AUTH, endpoint=TEST_ENDPOINT)
Example #14
def panelclient():
    # Panel - no client auth, only project auth using user auth token
    return HubstorageClient(endpoint=TEST_ENDPOINT)
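The two factory functions above (Examples #13 and #14) look like pytest fixtures with their decorators stripped; a minimal sketch of how such a fixture is typically registered and consumed (the @pytest.fixture decorator and the test body are assumptions, not part of the original source):

import pytest
from scrapinghub import HubstorageClient

@pytest.fixture
def hsclient():
    # assumed fixture registration; TEST_AUTH / TEST_ENDPOINT come from the test settings
    return HubstorageClient(auth=TEST_AUTH, endpoint=TEST_ENDPOINT)

def test_client_endpoint(hsclient):
    # the client keeps the endpoint it was created with
    assert hsclient.endpoint == TEST_ENDPOINT
    assert hsclient.auth is not None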
Example #15
 def client(self):
     if self._client is None:
         self._client = HubstorageClient(endpoint=self.endpoint,
                                         auth=self.auth)
     return self._client
Example #16
    def client(self):
        from scrapinghub import HubstorageClient

        if self._client is None:
            self._client = HubstorageClient(endpoint=self.endpoint, auth=self.auth)
        return self._client