Example 1
    def last_crawl_id(self):
        if not self._last_crawl_id:
            if self.spider_id:
                db_session = Session()
                last_crawl = db_session.query(Crawl.id)\
                    .filter(Crawl.spider_id == self.spider_id)\
                    .order_by(Crawl.crawl_date.desc(),
                              Crawl.id.desc())\
                    .limit(1)\
                    .first()
                self._last_crawl_id = last_crawl.id if last_crawl else None
                db_session.close()
        return self._last_crawl_id
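Every example on this page calls Session() as a module-level factory that is never shown in the snippets themselves. A minimal sketch of what that factory usually looks like with SQLAlchemy; the connection string and module layout below are assumptions, not taken from the source:

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# hypothetical DSN; the real project presumably configures this elsewhere
engine = create_engine('postgresql://user:password@localhost/crawls')
Session = sessionmaker(bind=engine)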
Example 2
    def init_all_products_hashes(self):
        db_session = Session()
        spider_db = self.current_crawl.spider
        try:
            upload_dst = spider_db.account.upload_destinations[0].name
        except (TypeError, IndexError):
            upload_dst = 'new_system'
        all_products_filename = os.path.join(DATA_DIR, '%s_all_products.csv' % spider_db.website_id)
        if os.path.exists(all_products_filename):
            self.all_products_df = pd.read_csv(all_products_filename, dtype=str)
            if not self.all_products_df.empty:
                last_date = self.all_products_df.iloc[0]['last_date']
                new_products = pd.DataFrame(self.get_all_products_website(upload_dst, spider_db.website_id, last_date))
                if not new_products.empty:
                    self.all_products_df = self.all_products_df.append(new_products)
        if not os.path.exists(all_products_filename) or self.all_products_df.empty:
            log.msg('DELISTED DUPLICATES DETECTION: %s does not exist or is empty' % all_products_filename)
            self.all_products_df = pd.DataFrame(self.get_all_products_website(upload_dst, spider_db.website_id))

        if not self.all_products_df.empty:
            # Check data integrity
            total_products = self.get_all_products_count(upload_dst, spider_db.website_id)
            total_collected = self.all_products_df.identifier.count()
            if total_products != total_collected:
                # Fall back to fetching the full product list again
                log.msg('DELISTED DUPLICATES DETECTION: total products count differs from number of products collected (%s / %s)' %
                        (total_products, total_collected))
                log.msg('DELISTED DUPLICATES DETECTION: trying to get all products')
                self.all_products_df = pd.DataFrame(self.get_all_products_website(upload_dst, spider_db.website_id))
            last_crawl = db_session.query(Crawl)\
                .filter(Crawl.spider_id == spider_db.id,
                        Crawl.status == 'upload_finished')\
                .order_by(Crawl.crawl_date.desc(),
                          Crawl.id.desc()).limit(1).first()
            if last_crawl:
                self.all_products_df['last_date'] = str(last_crawl.crawl_date)
            else:
                self.all_products_df['last_date'] = str(datetime.now().date())
            try:
                self.all_products_df.to_csv(all_products_filename, index=False, encoding='utf-8')
            except Exception:
                # retry without an explicit encoding if the utf-8 export fails
                self.all_products_df.to_csv(all_products_filename, index=False)

            self.all_products_df = self.all_products_df.where(pd.notnull(self.all_products_df), None)
            self.gen_hashes()

        db_session.close()
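The reassignment on the new_products branch above matters because DataFrame.append returns a new frame rather than mutating the caller. A minimal illustration using pd.concat, which is the equivalent call on current pandas (append was removed in pandas 2.0):

import pandas as pd

cached = pd.DataFrame({'identifier': ['1', '2'], 'last_date': ['2020-01-01', '2020-01-01']})
fresh = pd.DataFrame({'identifier': ['3'], 'last_date': ['2020-01-02']})

# concat (like the old append) returns a new frame, so the result must be assigned back
cached = pd.concat([cached, fresh], ignore_index=True)
print(len(cached))  # 3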
Example 3
def get_spider_cls(spider_name):
    db_session = Session()
    try:
        db_spider = db_session.query(Spider).filter(
            Spider.name == spider_name).first()
        if not db_spider:
            return None
        db_scrapely_spider = db_session.query(ScrapelySpiderData)\
            .filter(ScrapelySpiderData.spider_id == db_spider.id)\
            .first()
        if not db_scrapely_spider:
            return None
        db_extractors = db_session.query(ScrapelySpiderExtractor)\
            .filter(ScrapelySpiderExtractor.scrapely_spider_data_id == db_scrapely_spider.id)
        if not db_extractors.count():
            return None
        return ScrapelySpider
    finally:
        # always release the session, including on the early returns above
        db_session.close()
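Several of these snippets repeat the open-query-close dance by hand, and any early return or exception can leave the session unclosed. A sketch of the usual SQLAlchemy session_scope() helper that makes that pattern hard to get wrong, assuming the same Session factory as above:

from contextlib import contextmanager

@contextmanager
def session_scope():
    """Yield a session that is always closed, even on early returns or exceptions."""
    session = Session()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()

With it, get_spider_cls() could wrap its body in a single with session_scope() as db_session: block and drop the explicit close() calls.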
Example 4
def get_spider_by_cmd(cmd):
    jobid = extract_job_id(cmd)
    if not jobid:
        print "Couldn't extract jobid from: '%s'" % cmd
        return None

    jobid = unicode(jobid)

    # open the session only once the database is actually needed
    db_session = Session()
    spider = db_session.query(Spider)\
        .join(Crawl, Crawl.spider_id == Spider.id)\
        .filter(Crawl.jobid == jobid)\
        .filter(Crawl.status == 'running')\
        .first()

    if not spider:
        print "Couldn't find spider running with jobid: '%s'" % jobid

    db_session.close()

    return spider
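extract_job_id() is not part of this snippet. If the command line is assumed to carry the job id as a _job=<id> argument (a Scrapyd-style convention; this is an assumption, not something the source shows), a hypothetical version could look like:

import re

def extract_job_id(cmd):
    # hypothetical helper, not from the source: pull a hex job id out of a
    # command line such as "... crawl somespider -a _job=6487ec79947e..."
    match = re.search(r'_job=([0-9a-f]+)', cmd)
    return match.group(1) if match else None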
Example 5
    def __init__(self, name, *args, **kwargs):
        self.name = name

        db_session = Session()
        try:
            self.db_spider = db_session.query(Spider).filter(
                Spider.name == self.name).first()
            if not self.db_spider:
                raise CloseSpider("Spider %s not found" % self.name)
            self.db_scrapely_spider = db_session.query(ScrapelySpiderData)\
                .filter(ScrapelySpiderData.spider_id == self.db_spider.id)\
                .first()
            if not self.db_scrapely_spider:
                raise CloseSpider("Scrapely config for spider %s not found" %
                                  self.name)
            db_extractors = db_session.query(ScrapelySpiderExtractor)\
                .filter(ScrapelySpiderExtractor.scrapely_spider_data_id == self.db_scrapely_spider.id)
            if not db_extractors.count():
                raise CloseSpider("Scrapely extractors for spider %s not found" %
                                  self.name)

            self.allowed_domains = []
            self.start_urls = []

            for url in json.loads(self.db_scrapely_spider.start_urls_json):
                domain, start_url = _parse_start_url(url)
                self.allowed_domains.append(domain)
                self.start_urls.append(start_url)

            domain, start_url = _parse_start_url(self.db_scrapely_spider.start_url)
            self.allowed_domains.append(domain)
            self.start_urls.append(start_url)
        finally:
            # close the session even when one of the CloseSpider checks fires
            db_session.close()

        super(ScrapelySpider, self).__init__(*args, **kwargs)

        dispatcher.connect(self._spider_idle, signals.spider_idle)
        dispatcher.connect(self._spider_opened, signals.spider_opened)
        dispatcher.connect(self._spider_closed, signals.spider_closed)
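_parse_start_url() is also not shown. A plausible sketch, assuming it returns the domain for allowed_domains plus a normalized start URL; it uses the Python 2 urlparse module to match the print/unicode syntax used elsewhere on this page:

from urlparse import urlparse

def _parse_start_url(url):
    # hypothetical helper, not from the source
    if '://' not in url:
        url = 'http://' + url
    parsed = urlparse(url)
    return parsed.netloc, parsed.geturl()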
Example 6
def main():
    db_session = Session()

    spider_usage = get_spiders_usage()

    now = datetime.datetime.now()

    for data in spider_usage:
        spider = data['spider']
        cpu_usage = data['cpu_usage']
        mem_usage = data['mem_usage']

        usage = SpiderResourcesUsage()
        usage.spider_id = spider.id
        usage.worker_server_id = spider.worker_server_id
        usage.time = now
        usage.cpu_usage = cpu_usage
        usage.mem_usage = mem_usage

        db_session.add(usage)

    db_session.commit()
    db_session.close()
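main() only touches a handful of columns on SpiderResourcesUsage, which is enough to sketch what that model plausibly looks like. The column types and table name below are assumptions, not taken from the source:

from datetime import datetime

from sqlalchemy import Column, DateTime, Float, Integer
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()  # stands in for the project's real declarative base

class SpiderResourcesUsage(Base):
    __tablename__ = 'spider_resources_usage'  # assumed table name

    id = Column(Integer, primary_key=True)
    spider_id = Column(Integer)
    worker_server_id = Column(Integer)
    time = Column(DateTime, default=datetime.now)
    cpu_usage = Column(Float)
    mem_usage = Column(Float)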
Example 7
    def _get_spider_as_dict(self):
        spider_db = None
        spider_dict = None

        db_session = Session()

        if self._spider_id:
            spider_db = db_session.query(Spider).get(self._spider_id)
        elif self._website_id:
            spider_db = db_session.query(Spider)\
                .filter(Spider.website_id == self._website_id)\
                .one_or_none()
        elif self._spider_name:
            spider_db = db_session.query(Spider)\
                .filter(Spider.name == self._spider_name)\
                .one_or_none()

        if spider_db:
            spider_dict = spider_db.serialize()

        db_session.close()

        return spider_dict
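serialize() on the Spider model is not shown here. A generic sketch of that kind of row-to-dict helper, built on SQLAlchemy's inspection API; the real method may add relationships or custom fields:

from sqlalchemy import inspect

def serialize(obj):
    # dump every mapped column of an ORM instance into a plain dict
    return {attr.key: getattr(obj, attr.key)
            for attr in inspect(obj).mapper.column_attrs}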
Example 8
    def export_delisted_duplicate_errors(self):
        website_id = self.current_crawl.spider.website_id
        crawl_id = self.current_crawl.id
        filename = '%s_%s_delisted_duplicate_errors.csv' % (website_id, crawl_id)
        filename_full = os.path.join(DATA_DIR, filename)
        errors_df = pd.DataFrame(self.errors, dtype=str)
        try:
            errors_df.to_csv(filename_full, index=False, encoding='utf-8')
        except Exception:
            # retry without an explicit encoding if the utf-8 export fails
            errors_df.to_csv(filename_full, index=False)

        db_session = Session()
        dd_error = db_session.query(DelistedDuplicateError)\
            .filter(DelistedDuplicateError.website_id == website_id,
                    DelistedDuplicateError.crawl_id == crawl_id)\
            .first()
        if not dd_error:
            dd_error = DelistedDuplicateError()
        dd_error.website_id = website_id
        dd_error.crawl_id = crawl_id
        dd_error.filename = filename
        db_session.add(dd_error)
        db_session.commit()
        db_session.close()
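The query-then-create block in the middle of this example is the classic get-or-create pattern. A generic sketch of it as a reusable helper (not part of the source):

def get_or_create(db_session, model, **filters):
    # return an existing row matching the filters, or a new pending instance of it
    instance = db_session.query(model).filter_by(**filters).first()
    if instance is None:
        instance = model(**filters)
        db_session.add(instance)
    return instance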