Code Example #1
File: spiderdata.py  Project: oceancloud82/scraping
    def last_crawl_id(self):
        if not self._last_crawl_id:
            if self.spider_id:
                db_session = Session()
                last_crawl = db_session.query(Crawl.id)\
                    .filter(Crawl.spider_id == self.spider_id)\
                    .order_by(Crawl.crawl_date.desc(),
                              Crawl.id.desc())\
                    .limit(1)\
                    .first()
                self._last_crawl_id = last_crawl.id if last_crawl else None
                db_session.close()
        return self._last_crawl_id
Code Example #2
    def init_all_products_hashes(self):
        db_session = Session()
        spider_db = self.current_crawl.spider
        try:
            upload_dst = spider_db.account.upload_destinations[0].name
        except (TypeError, IndexError):
            upload_dst = 'new_system'
        all_products_filename = os.path.join(DATA_DIR, '%s_all_products.csv' % spider_db.website_id)
        if os.path.exists(all_products_filename):
            self.all_products_df = pd.read_csv(all_products_filename, dtype=pd.np.str)
            if not self.all_products_df.empty:
                last_date = self.all_products_df.iloc[0]['last_date']
                new_products = pd.DataFrame(self.get_all_products_website(upload_dst, spider_db.website_id, last_date))
                if not new_products.empty:
                    # DataFrame.append returns a new frame; assign the result
                    self.all_products_df = self.all_products_df.append(new_products)
        if not os.path.exists(all_products_filename) or self.all_products_df.empty:
            log.msg('DELISTED DUPLICATES DETECTION: %s does not exist' % all_products_filename)
            self.all_products_df = pd.DataFrame(self.get_all_products_website(upload_dst, spider_db.website_id))

        if not self.all_products_df.empty:
            # Check data integrity
            total_products = self.get_all_products_count(upload_dst, spider_db.website_id)
            total_collected = self.all_products_df.identifier.count()
            if total_products != total_collected:
                # Try get all products
                log.msg('DELISTED DUPLICATES DETECTION: total products count is different to number of products collected (%s / %s)' %
                        (total_products, total_collected))
                log.msg('DELISTED DUPLICATES DETECTION: trying getting all products')
                self.all_products_df = pd.DataFrame(self.get_all_products_website(upload_dst, spider_db.website_id))
            last_crawl = db_session.query(Crawl)\
                .filter(Crawl.spider_id == spider_db.id,
                        Crawl.status == 'upload_finished')\
                .order_by(Crawl.crawl_date.desc(),
                          Crawl.id.desc()).limit(1).first()
            if last_crawl:
                self.all_products_df['last_date'] = str(last_crawl.crawl_date)
            else:
                self.all_products_df['last_date'] = str(datetime.now().date())
            try:
                self.all_products_df.to_csv(all_products_filename, index=False, encoding='utf-8')
            except:
                self.all_products_df.to_csv(all_products_filename, index=False)

            self.all_products_df = self.all_products_df.where(pd.notnull(self.all_products_df), None)
            self.gen_hashes()

        db_session.close()
Code Example #3
def get_spider_by_cmd(cmd):
    db_session = Session()

    jobid = extract_job_id(cmd)
    if not jobid:
        print "Couldn't extract jobid from: '%s'" % cmd
        return None

    jobid = unicode(jobid)
    spider = db_session.query(Spider)\
        .join(Crawl, Crawl.spider_id == Spider.id)\
        .filter(Crawl.jobid == jobid)\
        .filter(Crawl.status == 'running')\
        .first()

    if not spider:
        print "Couldn't find spider running with jobid: '%s'" % jobid

    db_session.close()

    return spider
Code Example #4
    def export_delisted_duplicate_errors(self):
        website_id = self.current_crawl.spider.website_id
        crawl_id = self.current_crawl.id
        filename = '%s_%s_delisted_duplicate_errors.csv' % (website_id, crawl_id)
        filename_full = os.path.join(DATA_DIR, filename)
        errors_df = pd.DataFrame(self.errors, dtype=pd.np.str)
        try:
            errors_df.to_csv(filename_full, index=False, encoding='utf-8')
        except:
            errors_df.to_csv(filename_full, index=False)

        db_session = Session()
        dd_error = db_session.query(DelistedDuplicateError)\
            .filter(DelistedDuplicateError.website_id == website_id,
                    DelistedDuplicateError.crawl_id == crawl_id)\
            .first()
        if not dd_error:
            dd_error = DelistedDuplicateError()
        dd_error.website_id = website_id
        dd_error.crawl_id = crawl_id
        dd_error.filename = filename
        db_session.add(dd_error)
        db_session.commit()
        db_session.close()
Code Example #5
def get_spider_cls(spider_name):
    db_session = Session()
    db_spider = db_session.query(Spider).filter(
        Spider.name == spider_name).first()
    if not db_spider:
        return None
    db_scrapely_spider = db_session.query(ScrapelySpiderData)\
        .filter(ScrapelySpiderData.spider_id == db_spider.id)\
        .first()
    if not db_scrapely_spider:
        return None
    db_extractors = db_session.query(ScrapelySpiderExtractor)\
        .filter(ScrapelySpiderExtractor.scrapely_spider_data_id == db_scrapely_spider.id)
    if not db_extractors.count():
        return None

    db_session.close()

    return ScrapelySpider
Code Example #6
def main():
    db_session = Session()

    spider_usage = get_spiders_usage()

    now = datetime.datetime.now()

    for data in spider_usage:
        spider = data['spider']
        cpu_usage = data['cpu_usage']
        mem_usage = data['mem_usage']

        usage = SpiderResourcesUsage()
        usage.spider_id = spider.id
        usage.worker_server_id = spider.worker_server_id
        usage.time = now
        usage.cpu_usage = cpu_usage
        usage.mem_usage = mem_usage

        db_session.add(usage)

    db_session.commit()
    db_session.close()
Code Example #7
File: spider.py  Project: oceancloud82/scraping
    def __init__(self, name, *args, **kwargs):
        self.name = name

        db_session = Session()
        self.db_spider = db_session.query(Spider).filter(
            Spider.name == self.name).first()
        if not self.db_spider:
            raise CloseSpider("Spider %s not found" % self.name)
        self.db_scrapely_spider = db_session.query(ScrapelySpiderData)\
            .filter(ScrapelySpiderData.spider_id == self.db_spider.id)\
            .first()
        if not self.db_scrapely_spider:
            raise CloseSpider("Scrapely config for spider %s not found" %
                              self.name)
        db_extractors = db_session.query(ScrapelySpiderExtractor)\
            .filter(ScrapelySpiderExtractor.scrapely_spider_data_id == self.db_scrapely_spider.id)
        if not db_extractors.count():
            raise CloseSpider("Scrapely extractors for spider %s not found" %
                              self.name)

        self.allowed_domains = []
        self.start_urls = []

        for url in json.loads(self.db_scrapely_spider.start_urls_json):
            domain, start_url = _parse_start_url(url)
            self.allowed_domains.append(domain)
            self.start_urls.append(start_url)

        domain, start_url = _parse_start_url(self.db_scrapely_spider.start_url)
        self.allowed_domains.append(domain)
        self.start_urls.append(start_url)

        super(ScrapelySpider, self).__init__(*args, **kwargs)

        dispatcher.connect(self._spider_idle, signals.spider_idle)
        dispatcher.connect(self._spider_opened, signals.spider_opened)
        dispatcher.connect(self._spider_closed, signals.spider_closed)

        db_session.close()
Code Example #8
File: spiderdata.py  Project: oceancloud82/scraping
    def _get_spider_as_dict(self):
        spider_db = None
        spider_dict = None

        db_session = Session()

        if self._spider_id:
            spider_db = db_session.query(Spider).get(self._spider_id)
        elif self._website_id:
            spider_db = db_session.query(Spider)\
                .filter(Spider.website_id == self._website_id)\
                .one_or_none()
        elif self._spider_name:
            spider_db = db_session.query(Spider)\
                .filter(Spider.name == self._spider_name)\
                .one_or_none()

        if spider_db:
            spider_dict = spider_db.serialize()

        db_session.close()

        return spider_dict
Code Example #9
        return False
    else:
        return True


if __name__ == '__main__':
    pid_file_name = 'crawl.pid'
    pid_file = os.path.join(here, pid_file_name)
    if os.path.exists(pid_file):
        try:
            pid = int(open(pid_file).read())
        except ValueError:
            os.unlink(pid_file)
        else:
            if check_pid(pid):
                print 'The script is running with pid=%s' % pid
                sys.exit(1)
            else:
                os.unlink(pid_file)

    open(pid_file, 'w').write(str(os.getpid()))

    db_session = Session()
    spider_names = None
    if len(sys.argv) > 1:
        spider_names = sys.argv[1:]

    schedule_crawls_on_workers(db_session)

    os.unlink(pid_file)
Code Example #10
import os
import sys

from sqlalchemy.sql import text

HERE = os.path.dirname(os.path.abspath(__file__))
product_spiders_root = os.path.dirname(HERE)
project_root = os.path.dirname(product_spiders_root)

sys.path.append(project_root)
sys.path.append(os.path.join(project_root, 'product_spiders'))

from product_spiders.db import Session
from scrapy.utils.misc import walk_modules
from scrapy.utils.spider import iter_spider_classes
from productspidersweb.models import Spider

print sys.path
here = os.path.abspath(os.path.dirname(__file__))

db_session = Session()

spider_modules = ['product_spiders.spiders']

for name in spider_modules:
    for module in walk_modules(name):
        for spider in iter_spider_classes(module):
            sp = db_session.query(Spider).filter(
                Spider.name == spider.name).first()
            if sp:
                sp.module = str(spider.__module__)
                db_session.add(sp)

db_session.commit()
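
Every snippet in this listing obtains its session from the `Session` factory imported in the example above (`from product_spiders.db import Session`). The `product_spiders/db.py` module itself is not shown; a minimal sketch of what it presumably contains, assuming a standard SQLAlchemy `create_engine` plus `sessionmaker` setup (the connection URL below is a placeholder, not taken from the project), looks like this:

import os

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Placeholder connection string -- the real project would read its own settings.
DATABASE_URL = os.environ.get('DATABASE_URL', 'sqlite:///product_spiders.db')

engine = create_engine(DATABASE_URL)

# Session is the factory the examples call as Session() to open a new session.
Session = sessionmaker(bind=engine)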
Code Example #11
def main():
    db_session = Session()
    scheduler = SpiderUploadNotificationScheduler()
    e = EmailNotifier(SMTP_USER, SMTP_PASS, SMTP_FROM, SMTP_HOST, SMTP_PORT)
    spider_uploads = db_session.query(SpiderUpload).all()
    for s in spider_uploads:
        if not s.user.email:
            continue
        if scheduler.should_send_initial(s):
            subject = 'Spider upload request %s' % s.spider_name
            text = 'A spider upload has been assigned to you:\n'
            text += 'Account: %s\n' % (s.account.name if s.account else 'New account')
            text += 'Spider: %s\n' % s.spider_name
            if s.notes:
                text += 'Notes: %s' % s.notes
            e.send_notification([s.user.email] + TO, subject, text)
            print s.user.email
            s.last_notification = datetime.now()
            db_session.add(s)
        elif scheduler.should_send_final(s):
            subject = 'Spider deployed %s' % s.spider_name
            text = 'The following spider has been deployed:\n'
            text += 'Account: %s\n' % (s.account.name if s.account else 'New account')
            text += 'Spider: %s\n' % s.spider_name
            if s.notes:
                text += 'Notes: %s' % s.notes
            e.send_notification([s.user.email] + TO, subject, text)
            s.last_notification = datetime.now()
            db_session.add(s)
        elif scheduler.should_send_reminder(s):
            subject = 'Spider upload reminder %s' % s.spider_name
            text = 'The following spider has been assigned to you:\n'
            text += 'Account: %s\n' % (s.account.name if s.account else 'New account')
            text += 'Spider: %s\n' % s.spider_name
            if s.notes:
                text += 'Notes: %s' % s.notes
            e.send_notification([s.user.email], subject, text)
            s.last_notification = datetime.now()
            db_session.add(s)

        db_session.commit()
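
Several of the examples above return before `db_session.close()` runs or skip it entirely (Code Examples #3, #5, #9 and #11), which can leave sessions open on early exits. A common way to make the open/commit/close lifecycle automatic is a small context manager around the same `Session` factory; the helper below is an illustration of that pattern, not code from the project:

from contextlib import contextmanager

from product_spiders.db import Session


@contextmanager
def session_scope():
    """Open a session, commit on success, roll back on error, always close."""
    db_session = Session()
    try:
        yield db_session
        db_session.commit()
    except Exception:
        db_session.rollback()
        raise
    finally:
        db_session.close()

With it, a lookup like the one in Code Example #3 becomes:

with session_scope() as db_session:
    spider = db_session.query(Spider)\
        .join(Crawl, Crawl.spider_id == Spider.id)\
        .filter(Crawl.jobid == jobid, Crawl.status == 'running')\
        .first()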