def main():
    google_service_account_path = Path(__file__).parent / 'google_service_account.json'
    google_service_account_json = (os.getenv('GOOGLE_SERVICE_ACCOUNT')
                                   or google_service_account_path.read_text())
    google_service_account = json.loads(google_service_account_json)
    google_scope = ['https://spreadsheets.google.com/feeds',
                    'https://www.googleapis.com/auth/drive']
    credentials = ServiceAccountCredentials.from_json_keyfile_dict(google_service_account,
                                                                   google_scope)

    doc_key = '1TO5Yzk0-4V_RzRK5Jr9I_pF5knZsEZrNn2HKTXrHgls'
    doc = gspread.authorize(credentials).open_by_key(doc_key)
    records = doc.worksheet('jobs').get_all_records(default_blank=None)

    with db:
        for model in [Job, JobError, JobDropped]:
            model.drop_table()
            model.create_table()
        for record in records:
            Job.create(**coerce_record(record))

    Pool().map(run_spider, [
        'linkedin',
        'stackoverflow',
        'startupjobs',
    ])

def main():
    google_analytics_metrics = fetch_from_google_analytics()
    mailchimp_metrics = fetch_from_mailchimp()

    with db:
        GlobalMetric.drop_table()
        GlobalMetric.create_table()
        GlobalMetric.create(name='avg_monthly_users',
                            value=google_analytics_metrics['avg_monthly_users'])
        GlobalMetric.create(name='avg_monthly_pageviews',
                            value=google_analytics_metrics['avg_monthly_pageviews'])
        GlobalMetric.create(name='subscribers',
                            value=mailchimp_metrics['subscribers'])

        JobMetric.drop_table()
        JobMetric.create_table()
        for url, value in google_analytics_metrics['users_per_job'].items():
            try:
                job = Job.get_by_url(url)
                JobMetric.create(job=job, name='users', value=value)
            except Job.DoesNotExist:
                pass
        for url, value in google_analytics_metrics['pageviews_per_job'].items():
            try:
                job = Job.get_by_url(url)
                JobMetric.create(job=job, name='pageviews', value=value)
            except Job.DoesNotExist:
                pass
        for url, value in google_analytics_metrics['applications_per_job'].items():
            try:
                job = Job.get_by_url(url)
                JobMetric.create(job=job, name='applications', value=value)
            except Job.DoesNotExist:
                pass

        users_per_external_url = merge_metric_dicts(
            google_analytics_metrics['users_per_external_job'],
            mailchimp_metrics['users_per_external_url'])
        for url, value in users_per_external_url.items():
            try:
                job = Job.get_by_link(url)
                JobMetric.create(job=job, name='users', value=value)
            except Job.DoesNotExist:
                pass

        pageviews_per_external_url = merge_metric_dicts(
            google_analytics_metrics['pageviews_per_external_job'],
            mailchimp_metrics['pageviews_per_external_url'])
        for url, value in pageviews_per_external_url.items():
            try:
                job = Job.get_by_link(url)
                JobMetric.create(job=job, name='pageviews', value=value)
            except Job.DoesNotExist:
                pass

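# merge_metric_dicts() is defined elsewhere; a minimal sketch of what it
# presumably does — summing per-URL values from both sources. This is an
# assumption, not confirmed by the code above:
def merge_metric_dicts(metrics1, metrics2):
    merged = dict(metrics1)
    for url, value in metrics2.items():
        # URLs present in both dicts get their values added together
        merged[url] = merged.get(url, 0) + value
    return merged
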
def index():
    with db:
        jobs_count = Job.count()
        companies_count = Job.companies_count()
        return render_template('index.html',
                               jobs_count=jobs_count,
                               companies_count=companies_count,
                               stories=Story.listing())

def candidate():
    with db:
        jobs_count = Job.count()
        companies_count = Job.companies_count()
        return render_template('candidate.html',
                               jobs_count=jobs_count,
                               companies_count=companies_count,
                               thumbnail=thumbnail(title='Příručka hledání první práce v\u00a0IT'))

def jobs():
    with db:
        metrics = dict(**Metric.as_dict(), **Job.aggregate_metrics())
        jobs = Job.listing()
        return render_template('jobs.html',
                               nav_active='jobs',
                               subnav_tabs=JOBS_SUBNAV_TABS,
                               subnav_active='jobs',
                               jobs=jobs,
                               regions=REGIONS,
                               metrics=metrics,
                               thumbnail=thumbnail(title='Práce v\u00a0IT pro začátečníky'))

def job(job_id):
    with db:
        # Peewee's get_by_id() raises DoesNotExist instead of returning None,
        # so get_or_none() is needed for the 404 fallback to actually trigger
        job = Job.get_or_none(Job.id == job_id) or abort(404)
        jobs_count = Job.count()
        companies_count = Job.companies_count()
        return render_template('job.html',
                               job=job,
                               jobs_count=jobs_count,
                               companies_count=companies_count,
                               thumbnail=thumbnail(job_title=job.title,
                                                   job_company=job.company_name,
                                                   job_location=job.location))

def db():
    # Using a tmp file because we need to test opening and closing a db
    # connection here, and a :memory: sqlite db ceases to exist once the
    # connection is closed
    tmp_file = NamedTemporaryFile(delete=False)
    db_path = Path(tmp_file.name)
    tmp_file.close()

    db = SqliteDatabase(tmp_file.name)
    with db:
        Job.bind(db)
        Job.create_table()
    yield db
    if db_path.exists():
        db_path.unlink()

def job(job_id):
    with db:
        metrics = dict(**Metric.as_dict(), **Job.aggregate_metrics())
        job = Job.juniorguru_get_by_id(job_id)
        return render_template('job.html',
                               nav_active='jobs',
                               subnav_tabs=JOBS_SUBNAV_TABS,
                               subnav_active='jobs',
                               job=job,
                               metrics=metrics,
                               thumbnail=thumbnail(job_title=job.title,
                                                   job_company=job.company_name,
                                                   job_location=job.location))

def jobs():
    with db:
        jobs = Job.listing()
        jobs_count = Job.count()
        companies_count = Job.companies_count()
        jobs_bot = Job.bot_listing()
        return render_template('jobs.html',
                               jobs=jobs,
                               jobs_count=jobs_count,
                               companies_count=companies_count,
                               jobs_bot=jobs_bot,
                               thumbnail=thumbnail(title='Práce pro začínající programátory'))

def jobs():
    with db:
        jobs = Job.listing()
        jobs_count = Job.count()
        companies_count = Job.companies_count()
        jobs_bot = Job.bot_listing()
        return render_template('jobs.html',
                               jobs=jobs,
                               jobs_count=jobs_count,
                               companies_count=companies_count,
                               jobs_bot=jobs_bot,
                               thumbnail=thumbnail(title='Práce v\u00a0IT pro začátečníky'))

def jobs_region(region_id):
    # an unknown region ID responds with 404 instead of an unhandled IndexError
    region = next((reg for reg in REGIONS if reg['id'] == region_id), None)
    if region is None:
        abort(404)
    with db:
        metrics = dict(**Metric.as_dict(), **Job.aggregate_metrics())
        jobs = Job.region_listing(region['name'])
        return render_template('jobs_region.html',
                               nav_active='jobs',
                               subnav_tabs=JOBS_SUBNAV_TABS,
                               subnav_active='jobs',
                               jobs=jobs,
                               region=region,
                               regions=REGIONS,
                               metrics=metrics,
                               thumbnail=thumbnail(title=f"Práce v\u00a0IT pro začátečníky —\u00a0{region['name']}"))

def index():
    with db:
        metrics = Job.aggregate_metrics()
        return render_template('index.html',
                               nav_tabs=None,
                               metrics=metrics,
                               stories=Story.listing())

def test_listing_returns_only_not_expired_jobs(db_connection):
    job1 = create_job('1', expires_at=None)
    job2 = create_job('2', expires_at=date(1987, 8, 30))  # noqa
    job3 = create_job('3', expires_at=date.today())  # noqa
    job4 = create_job('4', expires_at=date.today() + timedelta(days=2))

    assert set(Job.listing()) == {job1, job4}

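# create_job() is a test helper defined elsewhere in the test module; a
# hypothetical sketch of it — the default field values below are assumptions
# for illustration, not taken from the real code:
def create_job(id, **kwargs):
    kwargs.setdefault('title', 'Junior Python Engineer')  # assumed default
    kwargs.setdefault('company_name', 'Honza Ltd.')  # assumed default
    kwargs.setdefault('link', f'https://example.com/jobs/{id}')  # assumed default
    kwargs.setdefault('source', 'juniorguru')  # assumed default
    return Job.create(id=str(id), **kwargs)
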
async def manage_jobs_channel(client):
    channel = await client.fetch_channel(JOBS_CHANNEL)
    jobs = list(Job.listing())
    seen_links = set()

    async for message in channel.history(limit=None, after=None):
        for job in jobs:
            if job.link.rstrip('/') in message.content:
                log.info(f'Job {job.link} exists')
                seen_links.add(job.link)
                if message.reactions:
                    job.upvotes_count = count_upvotes(message.reactions)
                    job.downvotes_count = count_downvotes(message.reactions)
                    with db:
                        job.save()
                    log.info(f'Saved {job.link} reactions')

    if DISCORD_MUTATIONS_ENABLED:
        new_jobs = [job for job in jobs if job.link not in seen_links]
        log.info(f'Posting {len(new_jobs)} new jobs')
        for job in new_jobs:
            await channel.send(f'**{job.title}**\n'
                               f'{job.company_name} – {job.location}\n'
                               f'{job.link}')
    else:
        log.warning("Skipping Discord mutations, DISCORD_MUTATIONS_ENABLED not set")

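# count_upvotes() and count_downvotes() are not shown here; a minimal sketch
# of what they presumably do, assuming discord.py Reaction objects. The emoji
# sets are assumptions, not the real configuration:
UPVOTE_EMOJIS = {'👍', '❤️'}  # assumed set
DOWNVOTE_EMOJIS = {'👎'}  # assumed set


def count_upvotes(reactions):
    # str() normalizes both unicode and custom emojis for comparison
    return sum(reaction.count for reaction in reactions
               if str(reaction.emoji) in UPVOTE_EMOJIS)


def count_downvotes(reactions):
    return sum(reaction.count for reaction in reactions
               if str(reaction.emoji) in DOWNVOTE_EMOJIS)
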
def test_newsletter_listing_backfills_up_to_min_count(db_connection):
    job1 = create_job('1', source='moo', sort_rank=5)
    job2 = create_job('2', source='foo', sort_rank=1)  # noqa
    job3 = create_job('3', source='bar', sort_rank=10)
    job4 = create_job('4', source='juniorguru')

    assert list(Job.newsletter_listing(3)) == [job4, job3, job1]

def test_database_id_prefilled(db, pipeline, item, spider):
    item['id'] = 'honza42'
    pipeline.process_item(item, spider)
    with db:
        job = Job.select()[0]

    assert job.id == 'honza42'

def test_database_company_logo_path(db, pipeline, item, spider):
    item['company_logos'] = [
        {
            'checksum': '6b874bd7b996e9323fd2e094be83ca4c',
            'path': 'company-logos/d40730d4068db31a09687ebb42f7637e26864a30.png',
            'status': 'uptodate',
            'url': 'https://www.startupjobs.cz/uploads/d6e95f8c946b72f36783aa0a0238341b.png',
        },
        {
            'checksum': 'f3e2f82d7d8b24367f0a2c24b3d1aea3',
            'path': 'company-logos/d1eed8447fb59dc9587dd97148a109a3cca77ed8.png',
            'status': 'uptodate',
            'url': 'https://www.startupjobs.cz/uploads/GQ1A8RDZWYUJfavicon155377551420.png',
        },
    ]
    pipeline.process_item(item, spider)
    with db:
        job = Job.select()[0]

    assert job.company_logo_path == 'images/company-logos/d40730d4068db31a09687ebb42f7637e26864a30.png'

def test_count(db_connection):
    create_job('1', approved_at=date(1987, 8, 30))
    create_job('2', approved_at=None)
    create_job('3', approved_at=date(1987, 8, 30))
    create_job('4', approved_at=date(1987, 8, 30), expires_at=date(1987, 9, 1))

    assert Job.count() == 2

def generate_messages(today):
    jobs = Job.juniorguru_listing()
    template_path = Path(__file__).parent / 'templates' / 'job_metrics.html'
    template = Template(template_path.read_text())
    return (create_message(job, template, today) for job in jobs)

def test_juniorguru_listing(db_connection):
    job1 = create_job('1', source='juniorguru', sort_rank=30)
    job2 = create_job('2', source='moo')  # noqa
    job3 = create_job('3', source='juniorguru', sort_rank=20)
    job4 = create_job('4', source='juniorguru', sort_rank=10)

    assert list(Job.juniorguru_listing()) == [job1, job3, job4]

def test_aggregate_metrics_companies_count(db_connection):
    create_job('1', company_link='https://example.com/1', source='juniorguru')
    create_job('2', company_link='https://example.com/2', source='juniorguru')
    create_job('3', company_link='https://example.com/2', source='juniorguru')
    create_job('4', company_link='https://example.com/3', source='xyz')

    assert Job.aggregate_metrics()['companies_count'] == 2

def test_remote_listing(db_connection):
    job1 = create_job('1', remote=True, sort_rank=30)
    job2 = create_job('2', remote=False)  # noqa
    job3 = create_job('3', remote=True, sort_rank=20)
    job4 = create_job('4', remote=True, sort_rank=10)

    assert list(Job.remote_listing()) == [job1, job3, job4]

def test_database(db, pipeline, item, spider):
    pipeline.process_item(item, spider)
    with db:
        job = Job.select()[0]

    assert len(job.id) == 56  # sha224 hex digest length
    assert job.source == 'dummy'  # spider name

def test_newsletter_listing_returns_only_jg_if_enough(db_connection):
    job1 = create_job('1', source='juniorguru')
    job2 = create_job('2', source='moo')  # noqa
    job3 = create_job('3', source='juniorguru')
    job4 = create_job('4', source='juniorguru')

    assert list(Job.newsletter_listing(3)) == [job1, job3, job4]

async def manage_jobs_voting_channel(client):
    # experimenting with Mila and ML
    channel = await client.fetch_channel(JOBS_VOTING_CHANNEL)
    seen_links = set()

    log.info('Processing voting for jobs')
    # TODO PoC, move this to models or revamp models altogether?
    # Peewee needs the == comparison to build the query; `is False` wouldn't work
    jobs = list(Job.select().where(Job.magic_is_junior == False))  # noqa: E712
    async for message in channel.history(limit=None, after=None):
        for job in jobs:
            link = job.link
            if link.rstrip('/') in message.content:
                log.info(f'Job {link} exists')
                seen_links.add(link)
                if message.reactions:
                    job.upvotes_count += count_upvotes(message.reactions)
                    job.downvotes_count += count_downvotes(message.reactions)
                    with db:
                        job.save()
                    log.info(f'Saved {link} reactions')

    log.info('Processing voting for dropped jobs')
    # TODO PoC, move this to models or revamp models altogether?
    jobs_dropped = list(JobDropped.select().where(JobDropped.magic_is_junior == True))  # noqa: E712
    async for message in channel.history(limit=None, after=None):
        for job_dropped in jobs_dropped:
            link = job_dropped.item['link']
            if link.rstrip('/') in message.content:
                log.info(f'Job {link} exists')
                seen_links.add(link)
                if message.reactions:
                    job_dropped.upvotes_count += count_upvotes(message.reactions)
                    job_dropped.downvotes_count += count_downvotes(message.reactions)
                    with db:
                        job_dropped.save()
                    log.info(f'Saved {link} reactions')

    if DISCORD_MUTATIONS_ENABLED:
        new_jobs = [job for job in jobs if job.link not in seen_links]
        log.info(f'Posting {len(new_jobs)} new jobs')
        for job in new_jobs:
            await channel.send(f'**{job.title}**\n'
                               f'{job.company_name} – {job.location}\n'
                               f'{job.link}')

        new_jobs_dropped = [job_dropped for job_dropped in jobs_dropped
                            if job_dropped.item['link'] not in seen_links]
        log.info(f'Posting {len(new_jobs_dropped)} new dropped jobs')
        for job_dropped in new_jobs_dropped:
            await channel.send(f"**{job_dropped.item['title']}**\n"
                               f"{job_dropped.item['company_name']} – {', '.join(job_dropped.item['locations_raw'])}\n"
                               f"{job_dropped.item['link']}")
    else:
        log.warning("Skipping Discord mutations, DISCORD_MUTATIONS_ENABLED not set")

def test_newsletter_listing_returns_only_juniorguru_if_enough(db_connection):
    job1 = create_job('1', source='juniorguru', sort_rank=30)
    job2 = create_job('2', source='moo')  # noqa
    job3 = create_job('3', source='juniorguru', sort_rank=20)
    job4 = create_job('4', source='juniorguru', sort_rank=10)
    job5 = create_job('5', source='juniorguru', sort_rank=5)

    assert list(Job.newsletter_listing(3)) == [job1, job3, job4, job5]

def test_companies_count_takes_only_approved_jobs(db):
    create_job('1', company_link='https://abc.example.com', is_approved=True)
    create_job('2', company_link='https://abc.example.com', is_approved=False)
    create_job('3', company_link='https://xyz.example.com', is_approved=True)
    create_job('4', company_link='https://xyz.example.com', is_approved=False)
    create_job('5', company_link='https://def.example.com', is_approved=False)

    assert Job.companies_count() == 2

def main():
    doc_key = '1TO5Yzk0-4V_RzRK5Jr9I_pF5knZsEZrNn2HKTXrHgls'
    records = download_sheet(doc_key, 'jobs')

    with db:
        for model in [Job, JobError, JobDropped]:
            model.drop_table()
            model.create_table()
        for record in records:
            Job.create(**coerce_record(record))

    Pool().map(run_spider, [
        'linkedin',
        'stackoverflow',
        'startupjobs',
    ])

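# download_sheet() presumably wraps the gspread logic that the earlier main()
# had inlined; a sketch reconstructed from that code:
def download_sheet(doc_key, worksheet_name):
    path = Path(__file__).parent / 'google_service_account.json'
    google_service_account_json = os.getenv('GOOGLE_SERVICE_ACCOUNT') or path.read_text()
    google_service_account = json.loads(google_service_account_json)
    google_scope = ['https://spreadsheets.google.com/feeds',
                    'https://www.googleapis.com/auth/drive']
    credentials = ServiceAccountCredentials.from_json_keyfile_dict(google_service_account,
                                                                   google_scope)
    doc = gspread.authorize(credentials).open_by_key(doc_key)
    return doc.worksheet(worksheet_name).get_all_records(default_blank=None)
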
def test_database(item, spider, db):
    Pipeline(db=db, model=Job).process_item(item, spider)
    with db:
        job = Job.select()[0]

    assert len(job.id) == 56  # sha224 hex digest length
    assert job.source == 'dummy'  # spider name
    assert job.is_approved is False

# a closure — item, response_data, self, and log come from the enclosing scope
def operation():
    job = Job.get_by_id(item.get('id') or create_id(item))
    job.item = item
    for attr, value in response_data.items():
        setattr(job, attr, value)
    job.save()
    log.debug(f"Updated job '{job.id}' with monitoring data")
    self.stats.inc_value('monitoring/job_saved')