def last_crawl_id(self):
    if not self._last_crawl_id:
        if self.spider_id:
            db_session = Session()
            last_crawl = db_session.query(Crawl.id)\
                .filter(Crawl.spider_id == self.spider_id)\
                .order_by(Crawl.crawl_date.desc(), Crawl.id.desc())\
                .limit(1)\
                .first()
            self._last_crawl_id = last_crawl.id if last_crawl else None
            db_session.close()
    return self._last_crawl_id
def init_all_products_hashes(self):
    db_session = Session()
    spider_db = self.current_crawl.spider
    try:
        upload_dst = spider_db.account.upload_destinations[0].name
    except (TypeError, IndexError):
        upload_dst = 'new_system'
    all_products_filename = os.path.join(DATA_DIR, '%s_all_products.csv' % spider_db.website_id)
    if os.path.exists(all_products_filename):
        self.all_products_df = pd.read_csv(all_products_filename, dtype=pd.np.str)
        if not self.all_products_df.empty:
            last_date = self.all_products_df.iloc[0]['last_date']
            new_products = pd.DataFrame(self.get_all_products_website(upload_dst, spider_db.website_id, last_date))
            if not new_products.empty:
                # DataFrame.append returns a new frame, so keep the result
                self.all_products_df = self.all_products_df.append(new_products)
    if not os.path.exists(all_products_filename) or self.all_products_df.empty:
        log.msg('DELISTED DUPLICATES DETECTION: %s does not exist' % all_products_filename)
        self.all_products_df = pd.DataFrame(self.get_all_products_website(upload_dst, spider_db.website_id))
    if not self.all_products_df.empty:
        # Check data integrity
        total_products = self.get_all_products_count(upload_dst, spider_db.website_id)
        total_collected = self.all_products_df.identifier.count()
        if total_products != total_collected:
            # Counts disagree, so fetch the full product list again
            log.msg('DELISTED DUPLICATES DETECTION: total products count differs from number of products collected (%s / %s)' % (total_products, total_collected))
            log.msg('DELISTED DUPLICATES DETECTION: trying to get all products')
            self.all_products_df = pd.DataFrame(self.get_all_products_website(upload_dst, spider_db.website_id))
    last_crawl = db_session.query(Crawl)\
        .filter(Crawl.spider_id == spider_db.id, Crawl.status == 'upload_finished')\
        .order_by(Crawl.crawl_date.desc(), Crawl.id.desc()).limit(1).first()
    if last_crawl:
        self.all_products_df['last_date'] = str(last_crawl.crawl_date)
    else:
        self.all_products_df['last_date'] = str(datetime.now().date())
    try:
        self.all_products_df.to_csv(all_products_filename, index=False, encoding='utf-8')
    except Exception:
        self.all_products_df.to_csv(all_products_filename, index=False)
    self.all_products_df = self.all_products_df.where(pd.notnull(self.all_products_df), None)
    self.gen_hashes()
    db_session.close()
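# Hedged sketch (not part of the original module): illustrates the
# `.where(pd.notnull(...), None)` idiom used in init_all_products_hashes above.
# It replaces NaN cells with Python None, which behaves more predictably when
# generating hashes and comparing against database values. Data below is made up.
def _example_nan_to_none():
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'identifier': ['1', np.nan], 'sku': ['A-1', np.nan]})
    df = df.where(pd.notnull(df), None)
    assert df.iloc[1]['identifier'] is None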
def get_spider_cls(spider_name):
    db_session = Session()
    try:
        db_spider = db_session.query(Spider).filter(
            Spider.name == spider_name).first()
        if not db_spider:
            return None
        db_scrapely_spider = db_session.query(ScrapelySpiderData)\
            .filter(ScrapelySpiderData.spider_id == db_spider.id)\
            .first()
        if not db_scrapely_spider:
            return None
        db_extractors = db_session.query(ScrapelySpiderExtractor)\
            .filter(ScrapelySpiderExtractor.scrapely_spider_data_id == db_scrapely_spider.id)
        if not db_extractors.count():
            return None
        return ScrapelySpider
    finally:
        # Release the session even when returning early
        db_session.close()
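# Hypothetical usage sketch for get_spider_cls(); the spider name below is made
# up. The function returns the ScrapelySpider class only when the Spider row,
# its ScrapelySpiderData and at least one extractor exist, so callers can guard
# on None before instantiating.
def _example_get_spider_cls_usage():
    spider_cls = get_spider_cls('example_spider')
    if spider_cls is not None:
        return spider_cls('example_spider')
    return None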
def get_spider_by_cmd(cmd):
    db_session = Session()
    jobid = extract_job_id(cmd)
    if not jobid:
        print "Couldn't extract jobid from: '%s'" % cmd
        db_session.close()
        return None
    jobid = unicode(jobid)
    spider = db_session.query(Spider)\
        .join(Crawl, Crawl.spider_id == Spider.id)\
        .filter(Crawl.jobid == jobid)\
        .filter(Crawl.status == 'running')\
        .first()
    if not spider:
        print "Couldn't find spider running with jobid: '%s'" % jobid
    db_session.close()
    return spider
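# Hypothetical usage sketch for get_spider_by_cmd(): the command-line format and
# the behaviour of extract_job_id() are assumptions, not confirmed by this
# module. The idea is to map the command line of a running process back to the
# Spider row whose crawl is currently marked as 'running'.
def _example_get_spider_by_cmd_usage(process_cmdline):
    spider = get_spider_by_cmd(process_cmdline)
    if spider is not None:
        print "Process belongs to spider: %s" % spider.name
    return spider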
def __init__(self, name, *args, **kwargs):
    self.name = name
    db_session = Session()
    self.db_spider = db_session.query(Spider).filter(
        Spider.name == self.name).first()
    if not self.db_spider:
        raise CloseSpider("Spider %s not found" % self.name)
    self.db_scrapely_spider = db_session.query(ScrapelySpiderData)\
        .filter(ScrapelySpiderData.spider_id == self.db_spider.id)\
        .first()
    if not self.db_scrapely_spider:
        raise CloseSpider("Scrapely config for spider %s not found" % self.name)
    db_extractors = db_session.query(ScrapelySpiderExtractor)\
        .filter(ScrapelySpiderExtractor.scrapely_spider_data_id == self.db_scrapely_spider.id)
    if not db_extractors.count():
        raise CloseSpider("Scrapely extractors for spider %s not found" % self.name)
    self.allowed_domains = []
    self.start_urls = []
    for url in json.loads(self.db_scrapely_spider.start_urls_json):
        domain, start_url = _parse_start_url(url)
        self.allowed_domains.append(domain)
        self.start_urls.append(start_url)
    domain, start_url = _parse_start_url(self.db_scrapely_spider.start_url)
    self.allowed_domains.append(domain)
    self.start_urls.append(start_url)
    super(ScrapelySpider, self).__init__(*args, **kwargs)
    dispatcher.connect(self._spider_idle, signals.spider_idle)
    dispatcher.connect(self._spider_opened, signals.spider_opened)
    dispatcher.connect(self._spider_closed, signals.spider_closed)
    db_session.close()
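# Minimal sketch of what _parse_start_url() presumably does; the real helper is
# defined elsewhere and its exact behaviour (e.g. stripping "www.") is an
# assumption. It returns a (domain, url) pair so __init__ above can fill
# allowed_domains and start_urls in one pass.
def _parse_start_url_sketch(url):
    from urlparse import urlparse  # Python 2 stdlib, matching this codebase

    domain = urlparse(url).netloc
    if domain.startswith('www.'):
        domain = domain[len('www.'):]
    return domain, url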
def main():
    db_session = Session()
    spider_usage = get_spiders_usage()
    now = datetime.datetime.now()
    for data in spider_usage:
        spider = data['spider']
        cpu_usage = data['cpu_usage']
        mem_usage = data['mem_usage']
        usage = SpiderResourcesUsage()
        usage.spider_id = spider.id
        usage.worker_server_id = spider.worker_server_id
        usage.time = now
        usage.cpu_usage = cpu_usage
        usage.mem_usage = mem_usage
        db_session.add(usage)
    db_session.commit()
    db_session.close()
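# Hedged sketch of the entry structure main() expects from get_spiders_usage();
# the real implementation lives elsewhere. Each entry carries the Spider row
# plus its measured CPU and memory usage. The numeric values and units below
# are placeholders, not confirmed by this module.
def _example_spider_usage_entry(spider):
    return {
        'spider': spider,    # Spider ORM instance
        'cpu_usage': 12.5,   # assumed to be a percentage
        'mem_usage': 256.0,  # assumed to be a memory figure, e.g. MB
    }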
def _get_spider_as_dict(self):
    spider_db = None
    spider_dict = None
    db_session = Session()
    if self._spider_id:
        spider_db = db_session.query(Spider).get(self._spider_id)
    elif self._website_id:
        spider_db = db_session.query(Spider)\
            .filter(Spider.website_id == self._website_id)\
            .one_or_none()
    elif self._spider_name:
        spider_db = db_session.query(Spider)\
            .filter(Spider.name == self._spider_name)\
            .one_or_none()
    if spider_db:
        spider_dict = spider_db.serialize()
    db_session.close()
    return spider_dict
def export_delisted_duplicate_errors(self):
    website_id = self.current_crawl.spider.website_id
    crawl_id = self.current_crawl.id
    filename = '%s_%s_delisted_duplicate_errors.csv' % (website_id, crawl_id)
    filename_full = os.path.join(DATA_DIR, filename)
    errors_df = pd.DataFrame(self.errors, dtype=pd.np.str)
    try:
        errors_df.to_csv(filename_full, index=False, encoding='utf-8')
    except Exception:
        errors_df.to_csv(filename_full, index=False)
    db_session = Session()
    dd_error = db_session.query(DelistedDuplicateError)\
        .filter(DelistedDuplicateError.website_id == website_id,
                DelistedDuplicateError.crawl_id == crawl_id)\
        .first()
    if not dd_error:
        dd_error = DelistedDuplicateError()
        dd_error.website_id = website_id
        dd_error.crawl_id = crawl_id
        dd_error.filename = filename
        db_session.add(dd_error)
        db_session.commit()
    db_session.close()