Example #1
# Module-level dependencies, assumed from usage:
import datetime

import ckan.plugins as p
import ckan.lib.navl.dictization_functions as df

from ckanext.pages import db  # provides pages_table, init_db and Page
# `schema` is the extension's page validation schema, defined elsewhere.


def _pages_update(context, data_dict):
    if db.pages_table is None:
        db.init_db(context['model'])
    org_id = data_dict.get('org_id')
    page = data_dict.get('page')
    # we need the page in the context for name validation
    context['page'] = page
    context['group_id'] = org_id

    data, errors = df.validate(data_dict, schema, context)

    if errors:
        raise p.toolkit.ValidationError(errors)

    out = db.Page.get(group_id=org_id, name=page)
    if not out:
        out = db.Page()
        out.group_id = org_id
        out.name = page
    items = ['title', 'content', 'name', 'private', 'order']
    for item in items:
        setattr(out, item, data.get(item))

    out.modified = datetime.datetime.utcnow()
    out.user_id = p.toolkit.c.userobj.id
    out.save()
    session = context['session']
    session.add(out)
    session.commit()
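
The snippet stops at the commit; for context, here is a minimal sketch of how such an action is typically invoked through CKAN's action API. The registration name 'ckanext_pages_update' is an assumption, since the IActions wiring is not shown above:

# Hedged usage sketch; the registered action name below is an assumption.
import ckan.plugins.toolkit as toolkit

def update_page(org_id, page_name, title, content):
    context = {'user': toolkit.c.user}
    data_dict = {
        'org_id': org_id,
        'page': page_name,
        'title': title,
        'content': content,
    }
    # get_action resolves the registered action and runs its auth check first.
    return toolkit.get_action('ckanext_pages_update')(context, data_dict)
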
    # The spider methods that follow assume module-level imports (datetime,
    # os, urllib.parse, scrapy, scrapy_splash) plus module constants and
    # compiled regexes such as SAVE_FOLDER, MAX_NO_CONSECUTIVE_TERPENES and
    # re_totalTerpenesNT.
    def read_sample(self, response):
        '''Detect a sample page.'''

        now = datetime.datetime.utcnow()
        client_number = response.meta['data-sclabs-client-number']

        self.logger.debug('%d: Reading sample "%s".', client_number,
                          response.url)

        page = self.session.query(
            db.Page).filter(db.Page.source_url == response.url).first()
        if page:
            self.logger.debug('%d: Sample page already exists in DB.',
                              client_number)
            page.fetched_at = now
        else:
            self.logger.debug('%d: Sample page does not exist in DB yet.',
                              client_number)
            page = db.Page(source_url=response.url,
                           status=response.status,
                           fetched_at=now)
            # New rows must be added to the session, or the commit below
            # silently persists nothing.
            self.session.add(page)
        self.session.commit()

        xpath_savepage_loaded_match = response.xpath(
            self.xpath_savepage_loaded)
        # The match above is currently unused; an earlier gated version was:
        # if xpath_savepage_loaded_match:
        #     page.status = response.status
        #     self.save_this(response)
        # else:
        #     self.log('PAGE {} WILL NOT BE SAVED!'.format(response.url))
        #     page.status = -404
        page.status = response.status
        self.session.commit()
        self.save_this(response)
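
The spider snippets here all revolve around a db.Page ORM model whose definition is not shown. A minimal SQLAlchemy sketch consistent with the columns used throughout (source_url, status, fetched_at, file, prev_url, online_database_id) might look like this; names and types are reconstructed guesses:

# Assumed model, reconstructed from usage; column types are guesses.
import datetime
from sqlalchemy import Column, DateTime, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Page(Base):
    __tablename__ = 'pages'

    id = Column(Integer, primary_key=True)
    source_url = Column(String, unique=True, index=True)
    prev_url = Column(String)             # page this one was linked from
    status = Column(Integer)              # last observed HTTP status
    file = Column(String)                 # local filename of a saved copy
    fetched_at = Column(DateTime, default=datetime.datetime.utcnow)
    online_database_id = Column(Integer)  # which lab/source this came from
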
Example #3
# Same module-level dependencies as in Example #1, plus `import json`.
def _pages_update(context, data_dict):
    if db.pages_table is None:
        db.init_db(context['model'])
    org_id = data_dict.get('org_id')
    page = data_dict.get('page')
    # we need the page in the context for name validation
    context['page'] = page
    context['group_id'] = org_id

    data, errors = df.validate(data_dict, schema, context)

    if errors:
        raise p.toolkit.ValidationError(errors)

    out = db.Page.get(group_id=org_id, name=page)
    if not out:
        out = db.Page()
        out.group_id = org_id
        out.name = page
    items = [
        'title',
        'content',
        'name',
        'private',
        'order',
        'page_type',
        'publish_date',
        'image_url',
    ]
    for item in items:
        # Backwards-compatible with older versions where page_type did not exist.
        setattr(out, item,
                data.get(item, 'page' if item == 'page_type' else None))

    extras = {}
    extra_keys = set(schema.keys()) - set(items + ['id', 'created'])
    for key in extra_keys:
        if key in data:
            extras[key] = data.get(key)
    out.extras = json.dumps(extras)

    out.modified = datetime.datetime.utcnow()
    out.user_id = p.toolkit.c.userobj.id
    out.save()
    session = context['session']
    session.add(out)
    session.commit()
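
The extras handling above serializes every validated key outside the fixed column list into a JSON string on the row. Reading them back is the mirror image; a small sketch, assuming page.extras holds the string written above:

# Hedged sketch of the reverse direction.
import json

def page_extras(page):
    '''Return the extra fields stored on a page as a dict.'''
    if not page.extras:
        return {}
    return json.loads(page.extras)
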
    def save_this(self, response):
        '''Save the given page to disk and record its filename.'''
        client_number = response.meta['data-sclabs-client-number']

        self.logger.debug('%d: Saving page from URL "%s".', client_number,
                          response.url)

        counter = 1
        parsedURL = urllib.parse.urlparse(response.url)
        re_sampleURLPath_match = re_sampleURLPath.match(parsedURL.path)
        if re_sampleURLPath_match:
            sample_id = re_sampleURLPath_match.group('sample_id')
            self.logger.debug('%d: Got sample ID "%s".', client_number,
                              sample_id)
        else:
            sample_id = 'NOID'
            self.logger.debug('%d: No sample ID found.', client_number)
        filename = '{}.html'.format(sample_id)
        filepath = os.path.join(SAVE_FOLDER, filename)
        while os.path.exists(filepath) or self.session.query(
                db.Page).filter(db.Page.file == filename).first():
            counter += 1
            filename = '{}_{}.html'.format(sample_id, counter)
            filepath = os.path.join(SAVE_FOLDER, filename)
        with open(filepath, 'wb') as f:
            f.write(response.body)
        page = self.session.query(
            db.Page).filter(db.Page.source_url == response.url).first()
        if page:
            self.logger.debug('%d: Sample page already exists in DB.',
                              client_number)
            page.file = filename
        else:
            self.logger.debug('%d: Sample page does not exist in DB yet.',
                              client_number)
            page = db.Page(source_url=response.url, file=filename)
            self.session.add(page)
        self.session.commit()
        self.logger.debug('%d: Saved page "%s" under file "%s".',
                          client_number, response.url, filepath)
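
The while loop in save_this implements a simple collision strategy: append an incrementing counter until neither the filesystem nor the DB knows the name. Isolated as a helper, the same idea looks like this (a sketch; is_taken stands in for the combined filesystem/DB check):

import os

def unique_filename(folder, sample_id, is_taken):
    '''Find the first '<sample_id>[_N].html' not rejected by is_taken.'''
    counter = 1
    filename = '{}.html'.format(sample_id)
    while is_taken(os.path.join(folder, filename)):
        counter += 1
        filename = '{}_{}.html'.format(sample_id, counter)
    return filename
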
    def start_requests(self):
        '''Generate the initial request for each client.'''
        self.logger.debug('Launching initial requests...')

        for client_number in range(7600, -1, -1):  # TODO
            self.logger.debug('%d: Building request for client.',
                              client_number)
            wanted_link = self.template_resultspage.format(
                client_number=client_number, page=1)
            request = scrapy.Request(url=wanted_link,
                                     callback=self.read_client,
                                     errback=self.errback)
            request.meta['data-sclabs-client-number'] = client_number
            request.meta['data-sclabs-client-lastknown-maxpage'] = 10000
            request.meta['data-sclabs-client-currentpage'] = 1
            self.logger.debug('%d: Setting currentpage counter to "%d".',
                              client_number,
                              request.meta['data-sclabs-client-currentpage'])
            request.meta['data-sclabs-NTterpenes-counter'] = 0
            self.logger.debug('%d: Setting NTterpenes counter to "%d".',
                              client_number,
                              request.meta['data-sclabs-NTterpenes-counter'])
            request.meta['dont_cache'] = True
            page = self.session.query(
                db.Page).filter(db.Page.source_url == wanted_link).first()
            if page:
                self.logger.debug('%d: Page already in database.',
                                  client_number)
                page.online_database_id = self.lab_id
            else:
                self.logger.debug('%d: Page not in database yet.',
                                  client_number)
                page = db.Page(source_url=wanted_link,
                               online_database_id=self.lab_id)
                self.session.add(page)
            self.session.commit()
            self.logger.debug('%d: Yielding request.', client_number)
            yield request
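
The dont_cache meta key is honored by Scrapy's built-in HttpCacheMiddleware, so the entry-point request for each client is always re-fetched while deeper pages may be served from the cache. That only has an effect if the cache is enabled in the project settings, e.g.:

# settings.py (sketch): enable the HTTP cache that dont_cache opts out of.
HTTPCACHE_ENABLED = True
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_EXPIRATION_SECS = 0  # 0 means cached entries never expire
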
    def errback(self, failure):
        '''Log failed pages.'''
        now = datetime.datetime.utcnow()
        self.logger.warning('Detected error while requesting page "%s".',
                            failure.request.url)

        if failure.check(scrapy.spidermiddlewares.httperror.HttpError):
            # Only HttpError failures carry a response; reading
            # failure.value.response unconditionally would raise
            # AttributeError for DNS or timeout errors.
            source_url = failure.value.response.url
            status_code = failure.value.response.status
            page = self.session.query(
                db.Page).filter(db.Page.source_url == source_url).first()
            if page:
                self.logger.debug('Page already exists in DB.')
                page.status = status_code
                page.fetched_at = now
                page.online_database_id = self.lab_id
            else:
                self.logger.debug('Page does not exist in DB yet.')
                page = db.Page(source_url=source_url,
                               status=status_code,
                               fetched_at=now,
                               online_database_id=self.lab_id)
                self.session.add(page)
            self.session.commit()
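
failure.check is the standard way to branch on the error type in a Scrapy errback; only HttpError failures carry a response. A sketch of how the same errback could distinguish other common failure types (the types are the ones documented by Scrapy; `spider` stands in for the spider instance):

from twisted.internet.error import DNSLookupError, TimeoutError
from scrapy.spidermiddlewares.httperror import HttpError

def classify_failure(spider, failure):
    if failure.check(HttpError):
        response = failure.value.response
        spider.logger.warning('HTTP %d on "%s".',
                              response.status, response.url)
    elif failure.check(DNSLookupError):
        spider.logger.warning('DNS lookup failed for "%s".',
                              failure.request.url)
    elif failure.check(TimeoutError):
        spider.logger.warning('Timeout on "%s".', failure.request.url)
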
    def read_client(self, response):
        '''Flip through a client's result pages.'''
        now = datetime.datetime.utcnow()
        client_number = response.meta['data-sclabs-client-number']
        no_terpenes_counter = response.meta['data-sclabs-NTterpenes-counter']
        page = self.session.query(
            db.Page).filter(db.Page.source_url == response.url).first()
        page.status = self.real_statusCode(response)
        page.fetched_at = now
        page.online_database_id = self.lab_id
        self.session.commit()

        self.logger.debug('%d: Reading client, NTterpenes counter is "%d".',
                          client_number, no_terpenes_counter)

        for elem_sample in response.xpath(self.xpath_samples):
            relative_sample_link = elem_sample.xpath(self.xpath_savepages)[0]
            sample_link = response.urljoin(relative_sample_link.attrib['href'])
            if no_terpenes_counter >= MAX_NO_CONSECUTIVE_TERPENES:
                self.logger.debug(
                    '%d consecutive samples without terpene tests, aborting page.',
                    no_terpenes_counter)
                break
            raw_totalTerpenes = elem_sample.xpath(
                self.xpath_samplepreview_totalterpenesNT)
            should_crawl = True
            if len(raw_totalTerpenes) > 0:
                re_totalTerpenesNT_match = re_totalTerpenesNT.match(
                    raw_totalTerpenes[0].extract())
                if re_totalTerpenesNT_match:
                    no_terpenes_counter += 1
                    should_crawl = False
                    self.logger.debug(
                        'Terpene test not present, will not crawl sample "%s", counter is now "%d".',
                        sample_link, no_terpenes_counter)
                else:
                    no_terpenes_counter = 0
                    should_crawl = True
                    self.logger.debug(
                        'Terpene test present, will crawl sample "%s", counter has been reset.',
                        sample_link)
            if should_crawl:
                self.logger.debug('%d: Building request for sample "%s".',
                                  client_number, sample_link)
                request = scrapy_splash.SplashRequest(
                    url=sample_link,
                    callback=self.read_sample,
                    args=splash_args,
                    errback=self.errback)
                request.meta['data-sclabs-client-number'] = client_number
                page = self.session.query(
                    db.Page).filter(db.Page.source_url == sample_link).first()
                if page:
                    self.logger.debug('%d: Sample page already in DB.',
                                      client_number)
                    page.prev_url = response.url
                    page.online_database_id = self.lab_id
                else:
                    self.logger.debug('%d: Sample page not in DB yet.',
                                      client_number)
                    page = db.Page(source_url=sample_link,
                                   prev_url=response.url,
                                   online_database_id=self.lab_id)
                    self.session.add(page)
                self.session.commit()
                if page.file:
                    self.logger.debug(
                        '%d: A file is already associated with sample page of URL "%s", skipping.',
                        client_number, sample_link)
                else:
                    self.logger.debug(
                        '%d: No file yet associated with sample page of URL "%s", yielding request.',
                        client_number, sample_link)
                    yield request

        self.logger.debug(
            '%d: Deciding whether to crawl the next client page.',
            client_number)
        if no_terpenes_counter < MAX_NO_CONSECUTIVE_TERPENES:
            self.logger.debug(
                '%d: NTterpenes counter is "%d" below limit "%d".',
                client_number, no_terpenes_counter,
                MAX_NO_CONSECUTIVE_TERPENES)
            self.logger.debug('%d: Determining current page.', client_number)
            # Initialize first: without this, an empty match would leave
            # currentpage_num unbound and the fallback check below would
            # raise UnboundLocalError.
            currentpage_num = None
            currentpage_match = response.xpath(
                self.xpath_resultspage_currentpage)
            if currentpage_match:
                try:
                    currentpage_num = int(currentpage_match[0].extract())
                except ValueError:
                    currentpage_num = None

            if currentpage_num is None:
                currentpage_num = response.meta[
                    'data-sclabs-client-currentpage']
            self.logger.debug('%d: Current page seems to be "%d".',
                              client_number, currentpage_num)

            self.logger.debug('%d: Determining last (maximum) page of client.',
                              client_number)
            maxpage_num = None  # same unbound-name guard as above
            maxpage_match = response.xpath(self.xpath_resultspage_maxpage)
            if maxpage_match:
                try:
                    maxpage_num = int(maxpage_match[0].extract())
                except ValueError:
                    maxpage_num = None

            if maxpage_num is None:
                maxpage_num = response.meta[
                    'data-sclabs-client-lastknown-maxpage']
            self.logger.debug('%d: Last client page seems to be "%d".',
                              client_number, maxpage_num)

            if currentpage_num < maxpage_num and currentpage_num < 10000:
                self.logger.debug('%d: Current page is below limit :)',
                                  client_number)
                wanted_link = self.template_resultspage.format(
                    client_number=client_number, page=currentpage_num + 1)
                request = scrapy.Request(url=wanted_link,
                                         callback=self.read_client,
                                         errback=self.errback)
                request.meta[
                    'data-sclabs-client-lastknown-maxpage'] = maxpage_num
                request.meta[
                    'data-sclabs-client-currentpage'] = currentpage_num + 1
                request.meta['data-sclabs-client-number'] = client_number
                request.meta[
                    'data-sclabs-NTterpenes-counter'] = no_terpenes_counter
                # if currentpage_num + 1 == maxpage_num:
                #     request.meta['dont_cache'] = True
                next_page = self.session.query(
                    db.Page).filter(db.Page.source_url == wanted_link).first()
                if next_page:
                    self.logger.debug('%d: Next client page already in DB.',
                                      client_number)
                    next_page.prev_url = response.url
                    next_page.online_database_id = self.lab_id
                else:
                    self.logger.debug('%d: Next client page not in DB yet.',
                                      client_number)
                    next_page = db.Page(source_url=wanted_link,
                                        prev_url=response.url,
                                        online_database_id=self.lab_id)
                    self.session.add(next_page)
                self.session.commit()
                self.logger.debug('%d: Yielding request for next client page.',
                                  client_number)
                yield request
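
The query-then-update-or-create dance on db.Page appears five times across read_sample, save_this, start_requests, errback and read_client; a small helper would remove the duplication. A possible sketch, with session and db.Page as above:

def _get_or_create_page(session, source_url, **fields):
    '''Fetch the Page row for source_url, creating it if missing,
    then apply the given field updates and commit.'''
    page = session.query(db.Page).filter(
        db.Page.source_url == source_url).first()
    if page is None:
        page = db.Page(source_url=source_url)
        session.add(page)
    for name, value in fields.items():
        setattr(page, name, value)
    session.commit()
    return page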