class JobFunnel(Logger):
    """Class that initializes a Scraper and scrapes a website to get jobs"""

    def __init__(self, config: JobFunnelConfigManager) -> None:
        """Initialize a JobFunnel object with a JobFunnel config

        Args:
            config (JobFunnelConfigManager): config object containing paths etc.
        """
        config.validate()  # NOTE: this ensures log file path exists
        super().__init__(level=config.log_level, file_path=config.log_file)
        self.config = config
        self.__date_string = date.today().strftime("%Y-%m-%d")
        self.master_jobs_dict = {}  # type: Dict[str, Job]

        # Open a session with/out a proxy configured
        self.session = Session()
        if self.config.proxy_config:
            self.session.proxies = {
                self.config.proxy_config.protocol: self.config.proxy_config.url
            }

        # Read the user's block list
        user_block_jobs_dict = {}  # type: Dict[str, str]
        if os.path.isfile(self.config.user_block_list_file):
            with open(self.config.user_block_list_file, 'r') as block_file:
                user_block_jobs_dict = json.load(block_file)

        # Read the user's duplicate jobs list (from TFIDF)
        duplicate_jobs_dict = {}  # type: Dict[str, str]
        if os.path.isfile(self.config.duplicates_list_file):
            with open(self.config.duplicates_list_file, 'r') as dupes_file:
                duplicate_jobs_dict = json.load(dupes_file)

        # Initialize our job filter
        self.job_filter = JobFilter(
            user_block_jobs_dict,
            duplicate_jobs_dict,
            self.config.search_config.blocked_company_names,
            T_NOW - timedelta(days=self.config.search_config.max_listing_days),
            desired_remoteness=self.config.search_config.remoteness,
            log_level=self.config.log_level,
            log_file=self.config.log_file,
        )

    @property
    def daily_cache_file(self) -> str:
        """The name of the pickle file containing the data scraped today

        TODO: instead of using a 'daily' cache file, we should be tying this
        into the search that was made to prevent cross-caching results.
        """
        return os.path.join(
            self.config.cache_folder, f"jobs_{self.__date_string}.pkl",
        )

    def run(self) -> None:
        """Scrape, update lists and save to CSV."""
        # Read the master CSV file
        if os.path.isfile(self.config.master_csv_file):
            self.master_jobs_dict = self.read_master_csv()

        # Load master CSV jobs if they exist and update our block list with
        # any jobs the user has set to a remove status.
        # NOTE: we want to do this first so that our filters use current info.
        if self.master_jobs_dict:
            self.update_user_block_list()
        else:
            self.logger.debug(
                "No master-CSV present, did not update block-list: %s",
                self.config.user_block_list_file
            )

        # Scrape jobs or load them from a cache if one exists (--no-scrape)
        scraped_jobs_dict = {}  # type: Dict[str, Job]
        if self.config.no_scrape:

            # Load cache since --no-scrape is set
            self.logger.info("Skipping scraping, running with --no-scrape.")
            if os.path.exists(self.daily_cache_file):
                scraped_jobs_dict = self.load_cache(self.daily_cache_file)
            else:
                self.logger.warning(
                    "No incoming jobs, missing cache: %s", self.daily_cache_file
                )
        else:

            # Scrape new jobs from all our configured providers and cache them
            scraped_jobs_dict = self.scrape()

        # Filter out any jobs we have rejected, archived or block-listed
        # NOTE: we do not remove duplicates here as these may trigger updates
        if scraped_jobs_dict:
            self.write_cache(scraped_jobs_dict)
            scraped_jobs_dict = self.job_filter.filter(
                scraped_jobs_dict, remove_existing_duplicate_keys=False
            )
        if self.master_jobs_dict:
            self.master_jobs_dict = self.job_filter.filter(
                self.master_jobs_dict, remove_existing_duplicate_keys=False,
            )

        # Parse duplicate jobs into updates for the master jobs dict
        # NOTE: we prevent inter-scrape duplicates by key-id within BaseScraper
        # FIXME: impl. TFIDF on inter-scrape duplicates
        duplicate_jobs = []  # type: List[DuplicatedJob]
        if self.master_jobs_dict and scraped_jobs_dict:

            # Remove jobs with duplicated key_ids from scrape + update master
            duplicate_jobs = self.job_filter.find_duplicates(
                self.master_jobs_dict, scraped_jobs_dict,
            )

            for match in duplicate_jobs:

                # Was it a key-id match?
                if match.type in [DuplicateType.KEY_ID,
                                  DuplicateType.EXISTING_TFIDF]:

                    # NOTE: original and duplicate have the same key-id here.
                    # When it's EXISTING_TFIDF, we can't set match.duplicate
                    # because it is only partially stored in the block list JSON
                    if match.original.key_id and (match.original.key_id
                                                  != match.duplicate.key_id):
                        raise ValueError(
                            "Found duplicate by key-id, but keys don't match! "
                            f"{match.original.key_id}, {match.duplicate.key_id}"
                        )

                    # Got a key-id match, pop from scrape dict and maybe update
                    upd = self.master_jobs_dict[
                        match.duplicate.key_id].update_if_newer(
                            scraped_jobs_dict.pop(match.duplicate.key_id))

                    self.logger.debug(
                        "Identified duplicate %s by key-id and %s original job "
                        "with its data.",
                        match.duplicate.key_id,
                        'updated older' if upd else 'did not update',
                    )

                # Was it a content-match?
                elif match.type == DuplicateType.NEW_TFIDF:

                    # Got a content match, pop from scrape dict and maybe update
                    upd = self.master_jobs_dict[
                        match.original.key_id].update_if_newer(
                            scraped_jobs_dict.pop(match.duplicate.key_id))

                    self.logger.debug(
                        "Identified %s as a duplicate by description and %s "
                        "original job %s with its data.",
                        match.duplicate.key_id,
                        'updated older' if upd else 'did not update',
                        match.original.key_id,
                    )

        # Update duplicates file (if any updates are incoming)
        if duplicate_jobs:
            self.update_duplicates_file()

        # Update master jobs dict with the incoming jobs that passed filters
        if scraped_jobs_dict:
            self.master_jobs_dict.update(scraped_jobs_dict)

        # Write-out to CSV or log messages
        if self.master_jobs_dict:

            # Write our updated jobs out (if none, don't make the file at all)
            self.write_master_csv(self.master_jobs_dict)
            self.logger.info(
                "Done. View your current jobs in %s",
                self.config.master_csv_file
            )

        else:
            # We got no new, unique jobs. This is normal when loading the cache
            # with --no-scrape, as all jobs are removed by the duplicate filter
            if self.config.no_scrape:
                # User is running --no-scrape probably just to update lists
                self.logger.debug("No new jobs were added.")
            else:
                self.logger.warning("No new jobs were added to CSV.")

    def _check_for_inter_scraper_validity(self, existing_jobs: Dict[str, Job],
                                          incoming_jobs: Dict[str, Job],
                                          ) -> None:
        """Verify that we aren't overwriting jobs by key-id between scrapers

        NOTE: this is a slow check, would be cool to improve the O(n) on this
        """
        existing_job_keys = existing_jobs.keys()
        for inc_key_id in incoming_jobs.keys():
            for exist_key_id in existing_job_keys:
                if inc_key_id == exist_key_id:
                    raise ValueError(
                        f"Inter-scraper key-id duplicate! {exist_key_id}"
                    )

    def scrape(self) -> Dict[str, Job]:
        """Run each of the desired Scraper.scrape() with threading and delaying
        """
        self.logger.info(
            "Scraping local providers with: %s", self.config.scraper_names
        )

        # Iterate thru scrapers and run their scrape.
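        # Accumulate every provider's results into a single dict keyed by
        # Job.key_id, verifying that no two scrapers emit the same key-id
        # before merging.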
        jobs = {}  # type: Dict[str, Job]
        for scraper_cls in self.config.scrapers:
            incoming_jobs_dict = {}
            start = time()
            scraper = scraper_cls(self.session, self.config, self.job_filter)
            try:
                incoming_jobs_dict = scraper.scrape()
            except Exception as e:
                self.logger.error(
                    "Failed to scrape jobs for %s: %s",
                    scraper_cls.__name__, e,
                )

            # Ensure we have no duplicates between our scrapers by key-id
            # (since we are updating the jobs dict with results)
            self._check_for_inter_scraper_validity(
                jobs, incoming_jobs_dict,
            )

            jobs.update(incoming_jobs_dict)
            end = time()
            self.logger.debug(
                "Scraped %d jobs from %s, took %.3fs",
                len(incoming_jobs_dict), scraper_cls.__name__, (end - start),
            )

        self.logger.info(
            "Completed all scraping, found %d new jobs.", len(jobs)
        )
        return jobs

    def recover(self) -> None:
        """Build a new master CSV from all the available pickles in our cache
        """
        self.logger.info("Recovering jobs from all cache files in cache folder")
        if os.path.exists(self.config.user_block_list_file):
            self.logger.warning(
                "Running recovery mode, but with existing block-list, delete "
                "%s if you want to start fresh from the cached data and not "
                "filter any jobs away.", self.config.user_block_list_file
            )
        all_jobs_dict = {}  # type: Dict[str, Job]
        for file in os.listdir(self.config.cache_folder):
            if file.endswith('.pkl'):
                all_jobs_dict.update(
                    self.load_cache(
                        os.path.join(self.config.cache_folder, file)
                    )
                )
        self.write_master_csv(self.job_filter.filter(all_jobs_dict))

    def load_cache(self, cache_file: str) -> Dict[str, Job]:
        """Load today's scrape data from pickle via date string

        TODO: search the cache for pickles that match search config.
        (we may need a registry for the pickles and search terms used)

        Args:
            cache_file (str): path to cache pickle file containing jobs dict
                keyed by Job.KEY_ID.

        Raises:
            FileNotFoundError: if cache file is missing

        Returns:
            Dict[str, Job]: jobs dict loaded from the cache file
        """
        if not os.path.exists(cache_file):
            raise FileNotFoundError(
                f"{cache_file} not found! Have you scraped any jobs today?"
            )
        else:
            with open(cache_file, 'rb') as cache:
                cache_dict = pickle.load(cache)
            jobs_dict = cache_dict['jobs_dict']
            version = cache_dict['version']
            if version != __version__:
                # NOTE: this may be an error in the future
                self.logger.warning(
                    "Loaded jobs cache has version mismatch! "
                    "cache version: %s, current version: %s",
                    version, __version__
                )
            self.logger.info(
                "Read %d jobs from previously-scraped jobs cache: %s.",
                len(jobs_dict.keys()), cache_file,
            )
            self.logger.debug(
                "NOTE: you may see many duplicate IDs detected if these jobs "
                "exist in your master CSV already."
            )
            return jobs_dict

    def write_cache(self, jobs_dict: Dict[str, Job],
                    cache_file: str = None) -> None:
        """Dump a jobs_dict into a pickle

        TODO: write search_config into the cache file and jobfunnel version
        TODO: some way to cache Job.RAW without hitting recursion limit

        Args:
            jobs_dict (Dict[str, Job]): jobs dict to dump into cache.
            cache_file (str, optional): file path to write to. Defaults to None.
""" cache_file = cache_file if cache_file else self.daily_cache_file for job in jobs_dict.values(): job._raw_scrape_data = None # pylint: disable=protected-access pickle.dump( { 'version': __version__, 'jobs_dict': jobs_dict, }, open(cache_file, 'wb'), ) self.logger.debug( "Dumped %d jobs to %s", len(jobs_dict.keys()), cache_file ) def read_master_csv(self) -> Dict[str, Job]: """Read in the master-list CSV to a dict of unique Jobs TODO: make blurb --> description and add short_description Returns: Dict[str, Job]: unique Job objects in the CSV """ jobs_dict = {} # type: Dict[str, Job] with open(self.config.master_csv_file, 'r', encoding='utf8', errors='ignore') as csvfile: for row in csv.DictReader(csvfile): # NOTE: we are doing legacy support here with 'blurb' etc. # In the future we should have an actual short description if 'short_description' in row: short_description = row['short_description'] else: short_description = '' post_date = datetime.strptime(row['date'], '%Y-%m-%d') if 'scrape_date' in row: scrape_date = datetime.strptime( row['scrape_date'], '%Y-%m-%d' ) else: scrape_date = post_date if 'raw' in row: # NOTE: we should never see this because raw cant be in CSV raw = row['raw'] else: raw = None # FIXME: this is the wrong way to compare row val to Enum.name! # We need to convert from user statuses status = None if 'status' in row: status_str = row['status'].strip() for p_status in JobStatus: if status_str.lower() == p_status.name.lower(): status = p_status break if not status: self.logger.warning( "Unknown status %s, setting to UNKNOWN", status_str ) status = JobStatus.UNKNOWN # NOTE: this is for legacy support: locale = None if 'locale' in row: locale_str = row['locale'].strip() for p_locale in Locale: if locale_str.lower() == p_locale.name.lower(): locale = p_locale break if not locale: self.logger.warning( "Unknown locale %s, setting to UNKNOWN", locale_str ) locale = locale.UNKNOWN # Check for remoteness (handle if not present for legacy) remoteness = Remoteness.UNKNOWN if 'remoteness' in row: remote_str = row['remoteness'].strip() remoteness = Remoteness[remote_str] if not locale: self.logger.warning( "Unknown locale %s, setting to UNKNOWN", locale_str ) locale = locale.UNKNOWN # Check for wage (handle if not present for legacy wage = '' if 'wage' in row: wage = row['wage'].strip() job = Job( title=row['title'], company=row['company'], location=row['location'], description=row['blurb'], key_id=row['id'], url=row['link'], locale=locale, query=row['query'], status=status, provider=row['provider'], short_description=short_description, post_date=post_date, scrape_date=scrape_date, raw=raw, tags=row['tags'].split(','), remoteness=remoteness, ) job.validate() jobs_dict[job.key_id] = job self.logger.debug( "Read %d jobs from master-CSV: %s", len(jobs_dict.keys()), self.config.master_csv_file ) return jobs_dict def write_master_csv(self, jobs: Dict[str, Job]) -> None: """Write out our dict of unique Jobs to a CSV Args: jobs (Dict[str, Job]): Dict of unique Jobs, keyd by unique id's """ with open(self.config.master_csv_file, 'w', encoding='utf8') as csvfile: writer = csv.DictWriter(csvfile, fieldnames=CSV_HEADER) writer.writeheader() for job in jobs.values(): job.validate() writer.writerow(job.as_row) self.logger.debug( "Wrote %d jobs to %s", len(jobs), self.config.master_csv_file, ) def update_user_block_list(self) -> None: """From data in master CSV file, add jobs with removeable statuses to our configured user block list file and save (if any) NOTE: adding jobs to block list 
will result in filter() removing them from all scraped & cached jobs in the future (persistant). Raises: FileNotFoundError: if no master_jobs_dict is provided and master csv file does not exist. """ # Try to load from CSV if master_jobs_dict is un-set if not self.master_jobs_dict: if os.path.isfile(self.config.master_csv_file): self.master_jobs_dict = self.read_master_csv() else: raise FileNotFoundError( f"Cannot update {self.config.user_block_list_file} without " f"{self.config.master_csv_file}" ) # Add jobs from csv that need to be filtered away, if any + update self n_jobs_added = 0 for job in self.master_jobs_dict.values(): if job.is_remove_status: if job.key_id not in self.job_filter.user_block_jobs_dict: n_jobs_added += 1 self.job_filter.user_block_jobs_dict[ job.key_id] = job.as_json_entry self.logger.debug( "Added %s to %s", job.key_id, self.config.user_block_list_file ) else: # This could happen if we are somehow mishandling block list self.logger.warning( "Job %s has been set to a removable status and removed " "from master CSV multiple times.", job.key_id ) if n_jobs_added: # Write out complete list with any additions from the masterlist # NOTE: we use indent=4 so that it stays human-readable. with open(self.config.user_block_list_file, 'w', encoding='utf8') as outfile: outfile.write( json.dumps( self.job_filter.user_block_jobs_dict, indent=4, sort_keys=True, separators=(',', ': '), ensure_ascii=False, ) ) self.logger.info( "Moved %d jobs into block-list due to removable statuses: %s", n_jobs_added, self.config.user_block_list_file ) def update_duplicates_file(self) -> None: """Update duplicates filter file if we have a path and contents TODO: this should be writing out DuplicatedJob objects and a version so that we retain links to original jobs. """ if self.config.duplicates_list_file: if self.job_filter.duplicate_jobs_dict: # Write out the changes NOTE: indent=4 is for human-readability self.logger.debug("Extending existing duplicate jobs dict.") with open(self.config.duplicates_list_file, 'w', encoding='utf8') as outfile: outfile.write( json.dumps( self.job_filter.duplicate_jobs_dict, indent=4, sort_keys=True, separators=(',', ': '), ensure_ascii=False, ) ) else: self.logger.debug( "Current duplicate jobs dict is empty, no updates written." ) else: self.logger.warning( "Duplicates will not be saved, no duplicates list " "file set. Saving to a duplicates file will ensure " "that jobs detected to be duplicates by contents persist." )
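

if __name__ == "__main__":
    # Minimal usage sketch, not part of the class above: it assumes a
    # validated JobFunnelConfigManager has already been constructed elsewhere
    # (e.g. from the project's YAML settings loading). `load_my_config()` is a
    # hypothetical placeholder for however you build that config object.
    config = load_my_config()  # hypothetical: returns a JobFunnelConfigManager
    funnel = JobFunnel(config)
    funnel.run()  # scrape (or load today's cache if no_scrape) and write the master CSV
    # funnel.recover()  # alternative: rebuild the master CSV from all cached pickles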