Ejemplo n.º 1
0
    def add_job(self, job_item):
        """
        Add a job to the PyJobsWeb database unless it is already stored.

        The item's URL is used as its public identifier: when
        ``self.job_exist`` reports that URL as known, nothing is written.

        :param job_item: Scrapy pyjobs_crawlers item object; 'url', 'source'
            and 'initial_crawl_datetime' are read unconditionally
        :return: None
        """
        job_public_id = job_item['url']

        if self.job_exist(job_public_id):
            # Call form of print: same output on Python 2, and valid syntax
            # on Python 3 (the statement form is not).
            print('Skip existing item')
            return

        job = Job()
        attributes = ['title', 'description', 'company', 'address', 'company_url',
                      'publication_datetime', 'publication_datetime_is_fake']

        # Optional attributes: copied only when the item contains them
        for attribute in attributes:
            if attribute in job_item:
                setattr(job, attribute, job_item[attribute])

        # These three keys are looked up without a presence check
        job.url = job_item['url']
        job.source = job_item['source']
        job.crawl_datetime = job_item['initial_crawl_datetime']

        if 'tags' in job_item:
            import json
            # Tags are stored as a JSON list of {'tag', 'weight'} objects
            tags = [{'tag': t.tag, 'weight': t.weight} for t in job_item['tags']]
            job.tags = json.dumps(tags)

        DBSession.add(job)
        transaction.commit()
Ejemplo n.º 2
0
def save_item_as_job(item):
    """
    Store a crawled item as a Job row unless a Job with the same URL exists.

    :param item: crawled item; 'url' and 'initial_crawl_datetime' are read
        unconditionally, the other attributes only when present
    :return: None
    """
    existing = DBSession.query(Job).filter(Job.url == item['url']).count()
    if existing:
        # Call form of print: same output on Python 2, and valid syntax
        # on Python 3 (the statement form is not).
        print('Skip existing item')
        return

    job = Job()
    attributes = ['title', 'description', 'company', 'address', 'company_url',
                  'publication_datetime']

    # Optional attributes: copied only when the item contains them
    for attribute in attributes:
        if attribute in item:
            setattr(job, attribute, item[attribute])

    job.url = item['url']
    job.crawl_datetime = item['initial_crawl_datetime']

    if 'tags' in item:
        import json
        # Tags are stored as a JSON list of {'tag', 'weight'} objects
        tags = [{'tag': t.tag, 'weight': t.weight} for t in item['tags']]
        job.tags = json.dumps(tags)

    DBSession.add(job)
    transaction.commit()
Ejemplo n.º 3
0
    def submit(self, *args, **kwargs):
        """
        Build a company object from the keyword arguments, store it, then
        redirect.

        :param args: accepted but not used
        :param kwargs: forwarded to ``self._build_company_obj``
        """
        new_company = self._build_company_obj(**kwargs)

        # Persist the freshly built object inside an explicit transaction
        transaction.begin()
        DBSession.add(new_company)
        transaction.commit()

        self._redirect()
Ejemplo n.º 4
0
    def submit(self, *args, **kwargs):
        """
        Build a company object from the keyword arguments, store it, then
        redirect.

        :param args: accepted but not used
        :param kwargs: forwarded to ``self._build_company_obj``
        """
        new_company = self._build_company_obj(**kwargs)

        # Persist the freshly built object inside an explicit transaction
        transaction.begin()
        DBSession.add(new_company)
        transaction.commit()

        self._redirect()
Ejemplo n.º 5
0
 def set_geolocation(cls, company_id, lat, lon):
     """
     Write coordinates on the row whose id is ``company_id`` and flag its
     geolocation as valid.

     :param company_id: id of the row to update
     :param lat: value written to 'latitude'
     :param lon: value written to 'longitude'
     """
     transaction.begin()
     target_rows = DBSession.query(cls).filter(cls.id == company_id)
     new_values = {
         'latitude': lat,
         'longitude': lon,
         'geolocation_is_valid': True,
     }
     target_rows.update(new_values)
     transaction.commit()
Ejemplo n.º 6
0
 def set_geolocation(cls, company_id, lat, lon):
     """
     Write coordinates on the row whose id is ``company_id`` and flag its
     geolocation as valid.

     :param company_id: id of the row to update
     :param lat: value written to 'latitude'
     :param lon: value written to 'longitude'
     """
     transaction.begin()
     target_rows = DBSession.query(cls).filter(cls.id == company_id)
     new_values = {
         'latitude': lat,
         'longitude': lon,
         'geolocation_is_valid': True,
     }
     target_rows.update(new_values)
     transaction.commit()
Ejemplo n.º 7
0
 def get_not_pushed_on_twitter(cls, limit=None):
     """
     Query the rows whose 'pushed_on_twitter' is False, by ascending id.

     :param limit: maximum number of rows; any falsy value (None, 0)
         leaves the query unlimited
     :return: the query, not yet executed by this method
     """
     pending = DBSession.query(cls) \
         .filter_by(pushed_on_twitter=False) \
         .order_by(cls.id.asc())
     if not limit:
         return pending
     return pending.limit(limit)
Ejemplo n.º 8
0
 def get_not_pushed_on_twitter(cls, limit=None):
     """
     Query the rows whose 'pushed_on_twitter' is False, by ascending id.

     :param limit: maximum number of rows; any falsy value (None, 0)
         leaves the query unlimited
     :return: the query, not yet executed by this method
     """
     pending = DBSession.query(cls) \
         .filter_by(pushed_on_twitter=False) \
         .order_by(cls.id.asc())
     if not limit:
         return pending
     return pending.limit(limit)
Ejemplo n.º 9
0
 def update_last_sync(cls, job_id, timestamp):
     """
     Record ``timestamp`` as the 'last_sync' of the row with id ``job_id``.

     'last_modified' is then set to ``timestamp`` only when its current
     value is lower than ``timestamp``.

     :param job_id: id of the row to update
     :param timestamp: new 'last_sync' value
     """
     transaction.begin()
     row = DBSession.query(cls).filter(cls.id == job_id)
     # 'last_modified' is explicitly assigned its own current value.
     # NOTE(review): presumably this keeps an automatic on-update default
     # from touching it -- confirm against the column definition.
     row.update({'last_sync': timestamp,
                 'last_modified': cls.last_modified})
     outdated = row.filter(cls.last_modified < timestamp)
     outdated.update({'last_modified': timestamp})
     transaction.commit()
Ejemplo n.º 10
0
 def update_last_sync(cls, job_id, timestamp):
     """
     Record ``timestamp`` as the 'last_sync' of the row with id ``job_id``.

     'last_modified' is then set to ``timestamp`` only when its current
     value is lower than ``timestamp``.

     :param job_id: id of the row to update
     :param timestamp: new 'last_sync' value
     """
     transaction.begin()
     row = DBSession.query(cls).filter(cls.id == job_id)
     # 'last_modified' is explicitly assigned its own current value.
     # NOTE(review): presumably this keeps an automatic on-update default
     # from touching it -- confirm against the column definition.
     row.update({'last_sync': timestamp,
                 'last_modified': cls.last_modified})
     outdated = row.filter(cls.last_modified < timestamp)
     outdated.update({'last_modified': timestamp})
     transaction.commit()
Ejemplo n.º 11
0
    def log(self, source, action, more=None):
        """
        Store a Log entry stamped with ``datetime.datetime.now()``.

        :param source: value stored as the entry's source
        :param action: message text
        :param more: optional detail; when not None it is appended to the
            message in parentheses
        """
        message = action if more is None else '%s (%s)' % (action, more)

        entry = Log()
        entry.source = source
        entry.message = message
        entry.datetime = datetime.datetime.now()

        DBSession.add(entry)
        transaction.commit()
Ejemplo n.º 12
0
    def rss(self, limit=50, source=None, *args, **kwargs):
        """
        RSS feed of jobs, most recent publication first
        :param limit: number of displayed jobs
        :param source: source name; when given, only jobs from that source
            are listed
        :return: RSS feed content
        """
        site_url = config.get('site.domain_base_url')
        feed = feedgenerator.Rss201rev2Feed(
            title=u"pyjobs : le job qu'il vous faut en python",
            link=site_url,
            description=u"Agrégation de jobs python",
            language=u"fr",
            feed_url=u"http://www.pyjobs.fr/rss?limit=%s" % limit
        )

        jobs = DBSession.query(model.JobAlchemy)

        # The source filter has to be applied before limit(): SQLAlchemy's
        # Query refuses filter() once LIMIT is set (InvalidRequestError),
        # so the previous ordering failed whenever a source was given.
        if source is not None:
            jobs = jobs.filter(model.JobAlchemy.source == source)

        jobs = jobs \
            .order_by(model.JobAlchemy.publication_datetime.desc()) \
            .limit(limit)

        for job in jobs:
            job_slug = slugify(job.title)
            feed.add_item(
                title=job.title,
                link=get_job_url(job.id, job_title=job.title, absolute=True),
                description=job.description,
                pubdate=job.publication_datetime,
                unique_id="%s/job/%d/%s" % (site_url, job.id, job_slug)
            )

        return feed.writeString('utf-8')
Ejemplo n.º 13
0
    def rss(self, limit=50, source=None, *args, **kwargs):
        """
        RSS feed of jobs, most recent publication first
        :param limit: number of displayed jobs
        :param source: source name; when given, only jobs from that source
            are listed
        :return: RSS feed content
        """
        site_url = config.get('site.domain_base_url')
        feed = feedgenerator.Rss201rev2Feed(
            title=u"pyjobs : le job qu'il vous faut en python",
            link=site_url,
            description=u"Agrégation de jobs python",
            language=u"fr",
            feed_url=u"http://www.pyjobs.fr/rss?limit=%s" % limit)

        jobs = DBSession.query(model.JobAlchemy)

        # The source filter has to be applied before limit(): SQLAlchemy's
        # Query refuses filter() once LIMIT is set (InvalidRequestError),
        # so the previous ordering failed whenever a source was given.
        if source is not None:
            jobs = jobs.filter(model.JobAlchemy.source == source)

        jobs = jobs \
            .order_by(model.JobAlchemy.publication_datetime.desc()) \
            .limit(limit)

        for job in jobs:
            job_slug = slugify(job.title)
            feed.add_item(title=job.title,
                          link=get_job_url(job.id,
                                           job_title=job.title,
                                           absolute=True),
                          description=job.description,
                          pubdate=job.publication_datetime,
                          unique_id="%s/job/%d/%s" %
                          (site_url, job.id, job_slug))

        return feed.writeString('utf-8')
Ejemplo n.º 14
0
    def rss(self, limit=50, source=None):
        """
        RSS feed of jobs, most recent publication first
        :param limit: number of displayed jobs
        :param source: source name; when given, only jobs from that source
            are listed
        :return: RSS feed content
        """
        site_url = config.get("site.domain_base_url")
        feed = feedgenerator.Rss201rev2Feed(
            title=u"PyJobs: Le job qu'il vous faut en python",
            link=site_url,
            description=u"Agrégation de jobs python",
            language=u"fr",
            feed_url=u"http://www.pyjobs.fr/rss?limit=%s" % limit,
        )

        jobs = DBSession.query(Job)

        # The source filter has to be applied before limit(): SQLAlchemy's
        # Query refuses filter() once LIMIT is set (InvalidRequestError),
        # so the previous ordering failed whenever a source was given.
        if source is not None:
            jobs = jobs.filter(Job.source == source)

        jobs = jobs.order_by(Job.publication_datetime.desc()).limit(limit)

        for job in jobs:
            job_slug = slugify(job.title)
            feed.add_item(
                title=job.title,
                link=job.url,
                description=job.description,
                pubdate=job.publication_datetime,
                unique_id="%s/job/%d/%s" % (site_url, job.id, job_slug),
            )

        return feed.writeString("utf-8")
Ejemplo n.º 15
0
    def index(self, source=None):
        """
        List jobs by descending publication datetime.

        :param source: when not None, only jobs whose source equals it
        :return: dict with 'sources' (SOURCES) and 'jobs' (the query)
        """
        newest_first = DBSession.query(Job).order_by(Job.publication_datetime.desc())

        if source is None:
            jobs = newest_first
        else:
            jobs = newest_first.filter(Job.source == source)

        return dict(sources=SOURCES, jobs=jobs)
Ejemplo n.º 16
0
 def job(self, job_id, job_title=None, previous=None):
     """
     Job detail page
     :param job_id: Job identifier
     :param job_title: Job title (optional) for pretty url; not read here
     :param previous: not read in this method
     :return: dict with 'job' (the matching Job, or None when no Job has
         this id) and 'sources'
     """
     try:
         job = DBSession.query(Job).filter_by(id=job_id).one()
     except NoResultFound:
         # ``job`` used to stay unbound on this path, so the return below
         # raised UnboundLocalError instead of reporting a missing job.
         job = None  # TODO: TurboGears 404 ?
     return dict(job=job, sources=SOURCES)
Ejemplo n.º 17
0
    def add_job(self, job_item):
        """
        Add a job to the PyJobsWeb database unless it is already stored.

        The item's URL is used as its public identifier: when
        ``self.job_exist`` reports that URL as known, nothing is written.

        :param job_item: Scrapy pyjobs_crawlers item object; 'url', 'source'
            and 'initial_crawl_datetime' are read unconditionally
        :return: None
        """
        job_public_id = job_item['url']

        if self.job_exist(job_public_id):
            # Call form of print: same output on Python 2, and valid syntax
            # on Python 3 (the statement form is not).
            print('Skip existing item')
            return

        job = model.JobAlchemy()

        # Attributes which do not require special treatment before
        # population
        attributes = ['title', 'description', 'company', 'address',
                      'company_url', 'publication_datetime',
                      'publication_datetime_is_fake']

        # Optional attributes: copied only when the item contains them
        for attribute in attributes:
            if attribute in job_item:
                setattr(job, attribute, job_item[attribute])

        # These three keys are looked up without a presence check
        job.url = job_item['url']
        job.source = job_item['source']
        job.crawl_datetime = job_item['initial_crawl_datetime']

        # Tags need a conversion first: they are stored as a JSON list of
        # {'tag', 'weight'} objects
        if 'tags' in job_item:
            tags = [{'tag': t.tag, 'weight': t.weight}
                    for t in job_item['tags']]
            job.tags = json.dumps(tags)

        # Insert the job offer in the Postgresql database
        DBSession.add(job)
        transaction.commit()
Ejemplo n.º 18
0
    def sources(self):
        """
        Collect, for every source in SOURCES, the datetime of its most
        recent Log row (None when the source has no Log row).

        :return: dict with 'sources', 'existing_fields' and
            'sources_last_crawl'
        """
        sources_last_crawl = {}
        for source_name in SOURCES:
            latest_log = DBSession.query(Log.datetime) \
                .filter(Log.source == source_name) \
                .order_by(Log.datetime.desc()) \
                .limit(1)
            try:
                last_crawl = latest_log.one()[0]
            except NoResultFound:
                last_crawl = None
            sources_last_crawl[source_name] = last_crawl

        return dict(
            sources=SOURCES,
            existing_fields=existing_fields,
            sources_last_crawl=sources_last_crawl,
        )
Ejemplo n.º 19
0
    def origine_des_annonces_diffusees(self, *args, **kwargs):
        """
        Collect, for every source (ordered by label), the datetime of its
        most recent Log row (None when the source has no Log row).

        :param args: accepted but not used
        :param kwargs: accepted but not used
        :return: dict with 'sources', 'existing_fields' and
            'sources_last_crawl'
        """
        by_label = sorted(SOURCES.items(), key=lambda item: item[1].label)
        sorted_sources = collections.OrderedDict(by_label)

        sources_last_crawl = {}
        for source_name in sorted_sources:
            latest_log = DBSession.query(Log.datetime) \
                .filter(Log.source == source_name) \
                .order_by(Log.datetime.desc()) \
                .limit(1)
            try:
                last_crawl = latest_log.one()[0]
            except NoResultFound:
                last_crawl = None
            sources_last_crawl[source_name] = last_crawl

        return dict(sources=sorted_sources,
                    existing_fields=existing_fields,
                    sources_last_crawl=sources_last_crawl)
Ejemplo n.º 20
0
    def sources(self):
        """
        Collect, for every source (ordered by label), the datetime of its
        most recent Log row (None when the source has no Log row).

        :return: dict with 'sources', 'existing_fields' and
            'sources_last_crawl'
        """
        by_label = sorted(SOURCES.items(), key=lambda item: item[1].label)
        sorted_sources = collections.OrderedDict(by_label)

        sources_last_crawl = {}
        for source_name in sorted_sources:
            latest_log = DBSession.query(Log.datetime) \
                .filter(Log.source == source_name) \
                .order_by(Log.datetime.desc()) \
                .limit(1)
            try:
                last_crawl = latest_log.one()[0]
            except NoResultFound:
                last_crawl = None
            sources_last_crawl[source_name] = last_crawl

        return dict(
            sources=sorted_sources,
            existing_fields=existing_fields,
            sources_last_crawl=sources_last_crawl
        )
Ejemplo n.º 21
0
    def logs(self, source=None, last_days=1, *args, **kwargs):
        """
        List recent Log rows for a fixed set of messages, newest first.

        :param source: when not None, only rows whose source equals it
        :param last_days: size of the time window in days; converted
            with ``int()``
        :param args: accepted but not used
        :param kwargs: accepted but not used
        :return: dict with 'sources', 'logs' (list of rows) and 'last_days'
        """
        watched_messages = ('CRAWL_LIST_START',
                            'CRAWL_LIST_FINISHED',
                            'ERROR_UNEXPECTED_END',
                            'ERROR_CRAWNLING')
        window = datetime.timedelta(days=-int(last_days))
        oldest_shown = datetime.datetime.now() + window

        logs_query = DBSession.query(Log).order_by(Log.datetime.desc())
        logs_query = logs_query.filter(Log.datetime >= oldest_shown)
        logs_query = logs_query.filter(Log.message.in_(watched_messages))

        if source is not None:
            logs_query = logs_query.filter(Log.source == source)

        return dict(sources=SOURCES,
                    logs=logs_query.all(),
                    last_days=last_days)
Ejemplo n.º 22
0
    def logs(self, source=None, last_days=1):
        """
        List recent Log rows for a fixed set of messages, newest first.

        :param source: when not None, only rows whose source equals it
        :param last_days: size of the time window in days; converted
            with ``int()``
        :return: dict with 'sources', 'logs' (list of rows) and 'last_days'
        """
        watched_messages = ('CRAWL_LIST_START',
                            'CRAWL_LIST_FINISHED',
                            'ERROR_UNEXPECTED_END',
                            'ERROR_CRAWNLING')
        window = datetime.timedelta(days=-int(last_days))
        oldest_shown = datetime.datetime.now() + window

        logs_query = DBSession.query(Log).order_by(Log.datetime.desc())
        logs_query = logs_query.filter(Log.datetime >= oldest_shown)
        logs_query = logs_query.filter(Log.message.in_(watched_messages))

        if source is not None:
            logs_query = logs_query.filter(Log.source == source)

        return dict(
            sources=SOURCES,
            logs=logs_query.all(),
            last_days=last_days
        )
Ejemplo n.º 23
0
 def setUp(self):
     """Create, add and flush one ``self.klass`` instance for each test."""
     try:
         # Constructor arguments: self.attrs, then the dependencies on top
         init_kwargs = dict(self.attrs)
         init_kwargs.update(self.do_get_dependencies())
         self.obj = self.klass(**init_kwargs)
         DBSession.add(self.obj)
         DBSession.flush()
         return self.obj
     except:
         # Catch-all on purpose: roll back on any failure, then re-raise
         DBSession.rollback()
         raise
Ejemplo n.º 24
0
 def get_validated_company(cls, company_id):
     """
     Fetch the row with id ``company_id``, restricted to validated rows.

     :param company_id: id to look up
     :return: the result of ``.one()`` on the filtered query
     """
     validated_match = (
         DBSession.query(cls)
         .filter(cls.id == company_id)
         .filter_by(validated=True)
     )
     return validated_match.one()
Ejemplo n.º 25
0
 def get_company(cls, company_id):
     """
     Fetch the row whose id equals ``company_id``.

     :param company_id: id to look up
     :return: the result of ``.one()`` on the id-filtered query
     """
     by_id = DBSession.query(cls).filter(cls.id == company_id)
     return by_id.one()
Ejemplo n.º 26
0
 def reset_last_sync(cls):
     """
     Set 'last_sync' to ``base_time()`` on every row matching
     ``cls.validated``.
     """
     transaction.begin()
     validated_rows = DBSession.query(cls).filter(cls.validated)
     validated_rows.update({'last_sync': base_time()})
     transaction.commit()
Ejemplo n.º 27
0
 def get_validated_companies(cls):
     """
     Query the validated rows, ordered by ascending name.

     :return: the query, not yet executed by this method
     """
     validated_rows = DBSession.query(cls).filter_by(validated=True)
     return validated_rows.order_by(cls.name.asc())
Ejemplo n.º 28
0
 def get_dirty_rows(cls):
     """
     Query the validated rows whose 'last_modified' is greater than
     their 'last_sync', ordered by ascending id.

     :return: the query, not yet executed by this method
     """
     validated_rows = DBSession.query(cls).filter(cls.validated)
     changed_since_sync = validated_rows.filter(
         cls.last_modified > cls.last_sync)
     return changed_since_sync.order_by(cls.id.asc())
Ejemplo n.º 29
0
 def job_offer_exists(cls, url):
     """
     Count the rows whose 'url' equals ``url``.

     :param url: URL to look for
     :return: the ``.count()`` result, so truthy when at least one row
         matches
     """
     same_url = DBSession.query(cls).filter(cls.url == url)
     return same_url.count()
Ejemplo n.º 30
0
 def set_pushed_on_twitter(cls, offer_id, pushed_on_twitter):
     """
     Write 'pushed_on_twitter' on the row whose id is ``offer_id``.

     :param offer_id: id of the row to update
     :param pushed_on_twitter: new value of the flag
     """
     transaction.begin()
     target_rows = DBSession.query(cls).filter(cls.id == offer_id)
     target_rows.update({'pushed_on_twitter': pushed_on_twitter})
     transaction.commit()
Ejemplo n.º 31
0
 def get_validated_company(cls, company_id):
     """
     Fetch the row with id ``company_id``, restricted to validated rows.

     :param company_id: id to look up
     :return: the result of ``.one()`` on the filtered query
     """
     validated_match = (
         DBSession.query(cls)
         .filter(cls.id == company_id)
         .filter_by(validated=True)
     )
     return validated_match.one()
Ejemplo n.º 32
0
 def set_address_is_valid(cls, company_id, is_valid):
     """
     Write 'address_is_valid' on the row whose id is ``company_id``.

     :param company_id: id of the row to update
     :param is_valid: new value of the flag
     """
     transaction.begin()
     target_rows = DBSession.query(cls).filter(cls.id == company_id)
     target_rows.update({'address_is_valid': is_valid})
     transaction.commit()
Ejemplo n.º 33
0
 def set_geolocation_is_valid(cls, offer_id, is_valid):
     """
     Write 'geolocation_is_valid' on the row whose id is ``offer_id``.

     :param offer_id: id of the row to update
     :param is_valid: new value of the flag
     """
     transaction.begin()
     target_rows = DBSession.query(cls).filter(cls.id == offer_id)
     target_rows.update({'geolocation_is_valid': is_valid})
     transaction.commit()
Ejemplo n.º 34
0
 def get_dirty_rows(cls):
     """
     Query the validated rows whose 'last_modified' is greater than
     their 'last_sync', ordered by ascending id.

     :return: the query, not yet executed by this method
     """
     validated_rows = DBSession.query(cls).filter(cls.validated)
     changed_since_sync = validated_rows.filter(
         cls.last_modified > cls.last_sync)
     return changed_since_sync.order_by(cls.id.asc())
Ejemplo n.º 35
0
 def tearDown(self):
     """Roll back DBSession once a model test method has run."""
     DBSession.rollback()
Ejemplo n.º 36
0
 def by_email_address(cls, email):
     """
     Look a user up by email address.

     :param email: value compared with 'email_address'
     :return: the ``.first()`` result of the filtered query
     """
     same_email = DBSession.query(cls).filter_by(email_address=email)
     return same_email.first()
Ejemplo n.º 37
0
 def get_pending_geolocations(cls):
     """
     Query the validated rows that have a valid address but no valid
     geolocation, ordered by ascending id.

     :return: the query, not yet executed by this method
     """
     pending = (
         DBSession.query(cls)
         .filter_by(address_is_valid=True)
         .filter_by(geolocation_is_valid=False)
         .filter_by(validated=True)
     )
     return pending.order_by(cls.id.asc())
Ejemplo n.º 38
0
 def reset_last_sync(cls):
     """
     Set 'last_sync' to ``base_time()`` on every row matching
     ``cls.validated``.
     """
     transaction.begin()
     validated_rows = DBSession.query(cls).filter(cls.validated)
     validated_rows.update({'last_sync': base_time()})
     transaction.commit()
Ejemplo n.º 39
0
 def set_pushed_on_twitter(cls, offer_id, pushed_on_twitter):
     """
     Write 'pushed_on_twitter' on the row whose id is ``offer_id``.

     :param offer_id: id of the row to update
     :param pushed_on_twitter: new value of the flag
     """
     transaction.begin()
     target_rows = DBSession.query(cls).filter(cls.id == offer_id)
     target_rows.update({'pushed_on_twitter': pushed_on_twitter})
     transaction.commit()
Ejemplo n.º 40
0
 def get_validated_companies(cls):
     """
     Query the validated rows, ordered by ascending name.

     :return: the query, not yet executed by this method
     """
     validated_rows = DBSession.query(cls).filter_by(validated=True)
     return validated_rows.order_by(cls.name.asc())
Ejemplo n.º 41
0
 def set_address_is_valid(cls, company_id, is_valid):
     """
     Write 'address_is_valid' on the row whose id is ``company_id``.

     :param company_id: id of the row to update
     :param is_valid: new value of the flag
     """
     transaction.begin()
     target_rows = DBSession.query(cls).filter(cls.id == company_id)
     target_rows.update({'address_is_valid': is_valid})
     transaction.commit()
Ejemplo n.º 42
0
 def get_invalid_addresses(cls):
     """
     Query the rows whose 'address_is_valid' is False.

     :return: the query, not yet executed by this method
     """
     all_rows = DBSession.query(cls)
     return all_rows.filter_by(address_is_valid=False)
Ejemplo n.º 43
0
 def test_query_obj(self):
     """The stored object can be queried back with its expected attributes"""
     stored = DBSession.query(self.klass).one()
     for attr_name, expected in self.attrs.items():
         eq_(getattr(stored, attr_name), expected)
Ejemplo n.º 44
0
 def get_job_offer(cls, offer_id):
     """
     Fetch the row whose id equals ``offer_id``.

     :param offer_id: id to look up
     :return: the result of ``.one()`` on the id-filtered query
     """
     by_id = DBSession.query(cls).filter(cls.id == offer_id)
     return by_id.one()
Ejemplo n.º 45
0
 def by_email_address(cls, email):
     """
     Look a user up by email address.

     :param email: value compared with 'email_address'
     :return: the ``.first()`` result of the filtered query
     """
     same_email = DBSession.query(cls).filter_by(email_address=email)
     return same_email.first()
Ejemplo n.º 46
0
 def by_user_name(cls, username):
     """
     Look a user up by user name.

     :param username: value compared with 'user_name'
     :return: the ``.first()`` result of the filtered query
     """
     same_name = DBSession.query(cls).filter_by(user_name=username)
     return same_name.first()
Ejemplo n.º 47
0
def current_server_timestamp():
    """
    Execute ``func.current_timestamp()`` through DBSession.

    :return: the scalar value of the execution result
    """
    result = DBSession.execute(func.current_timestamp())
    return result.scalar()
Ejemplo n.º 48
0
 def get_all_job_offers(cls):
     """
     Query every row, ordered by descending publication datetime.

     :return: the query, not yet executed by this method
     """
     all_rows = DBSession.query(cls)
     return all_rows.order_by(cls.publication_datetime.desc())
Ejemplo n.º 49
0
 def get_company(cls, company_id):
     """
     Fetch the row whose id equals ``company_id``.

     :param company_id: id to look up
     :return: the result of ``.one()`` on the id-filtered query
     """
     by_id = DBSession.query(cls).filter(cls.id == company_id)
     return by_id.one()
Ejemplo n.º 50
0
 def set_geolocation_is_valid(cls, offer_id, is_valid):
     """
     Write 'geolocation_is_valid' on the row whose id is ``offer_id``.

     :param offer_id: id of the row to update
     :param is_valid: new value of the flag
     """
     transaction.begin()
     target_rows = DBSession.query(cls).filter(cls.id == offer_id)
     target_rows.update({'geolocation_is_valid': is_valid})
     transaction.commit()
Ejemplo n.º 51
0
 def by_user_name(cls, username):
     """
     Look a user up by user name.

     :param username: value compared with 'user_name'
     :return: the ``.first()`` result of the filtered query
     """
     same_name = DBSession.query(cls).filter_by(user_name=username)
     return same_name.first()
Ejemplo n.º 52
0
 def get_pending_geolocations(cls):
     """
     Query the validated rows that have a valid address but no valid
     geolocation, ordered by ascending id.

     :return: the query, not yet executed by this method
     """
     pending = (
         DBSession.query(cls)
         .filter_by(address_is_valid=True)
         .filter_by(geolocation_is_valid=False)
         .filter_by(validated=True)
     )
     return pending.order_by(cls.id.asc())