def start_search(self, search_term=None):
    """Search the job sites for ads and store the results in the database.

    Every search term is handed to a fresh Monster, Indeed, Duunitori and
    Oikotie parser. The parsers' ``parse`` coroutines are awaited together
    with ``asyncio.gather`` on the current event loop, after which the job
    ads collected by each parser are stored in the database. To avoid
    bombarding the sites, the method then sleeps for a random interval of
    up to one second before continuing with the next search term.

    Arguments
    ----------
    search_term : str or list[str]
        Search term(s) to use. A single string is treated as one search
        term, a list or tuple is searched item by item. If None (or
        empty), instance variable search_terms, set during initialization,
        is used.
    """
    # The module-level RNG is re-seeded with a constant on every call, so
    # the sequence of delays is the same for each invocation.
    random.seed(1222)
    datab = db_controls.JobAdDB(self._db_name)

    if not search_term:
        searchables = self._search_terms
    elif isinstance(search_term, (list, tuple)):
        # Documented input type: search each item rather than treating
        # the whole collection as one term.
        searchables = list(search_term)
    else:
        searchables = [search_term]

    for term in searchables:
        print("Searching for \"%s\"." % term)

        # Initialize parsers
        ps = [parsers.MonsterParser(),
              parsers.IndeedParser(),
              parsers.DuunitoriParser(),
              parsers.OikotieParser()]

        # Search for job ads
        loop = asyncio.get_event_loop()
        tasks = [parser.parse(term) for parser in ps]
        loop.run_until_complete(asyncio.gather(*tasks))

        # Save job ads in database
        for parser in ps:
            datab.store_ads(parser.get_job_ads())

        # Sleep for random time to avoid bombarding sites
        time.sleep(random.uniform(0, 1))
def output_results(self, date_start, date_end, output_name, output_type):
    """Write job ads stored in the database to an HTML or CSV file.

    The ads returned by the database for the given date range are passed
    to the writer matching ``output_type``. Any other ``output_type`` is
    ignored and nothing is written.

    Arguments
    ----------
    date_start : :class:`datetime`
        Earliest date of job ads. If None, all job ads since the start of
        the database are output.
    date_end : :class:`datetime`
        Latest date of job ads. If None, all job ads until end of database
        are output. If both date_start and date_end are None, all job ads
        in the database are output.
    output_name : str
        Name of the file to output results to.
    output_type : str
        Type of output file, "csv" or "html" possible.
    """
    database = db_controls.JobAdDB(self._db_name)
    print("Writing to %s from %s." % (output_name, self._db_name))

    # Pick the writer first; the ads are only fetched for a known type.
    if output_type == "html":
        write_file = database.write_HTML_file
    elif output_type == "csv":
        write_file = database.write_CSV_file
    else:
        return

    write_file(database.get_ads(date_start, date_end), output_name)
def recomm_store_ads(self, JAC, language, date_start, date_end):
    """Store model recommendations for job ads in the database.

    Job ads of the given language within the given dates are fetched from
    the database, run through ``JAC.recommend_ads`` and the result is
    written back via the database's recommendation update. The database
    is disconnected afterwards.

    Arguments
    ----------
    JAC : :class:`JobAdClassification`
        Fully set up :class:`JobAdClassification` instance.
    language : str
        Language of model and job ads to provide recommendations for.
    date_start : :class:`datetime`
        Earliest date of job ads. If None, all job ads since the start of
        the database are included.
    date_end : :class:`datetime`
        Latest date of job ads. If None, all job ads after date_start are
        included. If both date_start and date_end are None, all job ads in
        the database are included.

    Raises
    ----------
    EnvironmentError
        If classification is not enabled in this instance.
    """
    classification_enabled = bool(self._classification)
    if not classification_enabled:
        raise EnvironmentError(
            "Classification not enabled in JobAdCollector.")

    database = db_controls.JobAdDB(self._db_name)
    candidates = database.get_ads(date_start, date_end, language)
    database.update_ads_recommendation(JAC.recommend_ads(candidates))
    database.disconnect_db()
def det_lang_store_ads(self, date_start, date_end):
    """Determine the language of job ads and store it in the database.

    All job ads within the given dates are fetched, passed to
    ``det_lang_ads`` of a :class:`JobAdClassification` instance created
    with empty search terms, sites and language, and the result is written
    back to the database. The database is disconnected afterwards.
    Classification has to be enabled in the JobAdCollector instance.

    Arguments
    ----------
    date_start : :class:`datetime`
        Earliest date of job ads. If None, all job ads since the start of
        the database are included.
    date_end : :class:`datetime`
        Latest date of job ads. If None, all job ads after date_start are
        included. If both date_start and date_end are None, all job ads in
        the database are included.

    Raises
    ----------
    EnvironmentError
        If classification is not enabled in this instance.
    """
    classification_enabled = bool(self._classification)
    if not classification_enabled:
        raise EnvironmentError(
            "Classification not enabled in JobAdCollector.")

    database = db_controls.JobAdDB(self._db_name)
    # Only language detection is needed, so no search terms, sites or
    # language are supplied to the classifier.
    detector = classification.JobAdClassification(
        self._Rlibpath, [], [], "")
    stored_ads = database.get_ads(date_start, date_end)
    database.update_ads_language(detector.det_lang_ads(stored_ads))
    database.disconnect_db()
def train_model(self, language,
                date_start=datetime.datetime.strptime("01-01-2015",
                                                      "%d-%m-%Y"),
                date_end=None):
    """Trains random forest model on classified job ads.

    All classified job ads of the given language between argument dates
    are fetched from the database and passed to the ``train_model`` method
    of a new :class:`JobAdClassification` instance. The database is
    disconnected afterwards.

    Arguments
    ----------
    language : str
        Language of job ads to train on. Needed for proper stemming and
        removal of stopwords.
    date_start : :class:`datetime`
        Earliest date of job ads. Default is start of 2015.
    date_end : :class:`datetime`
        Latest date of job ads. If None (the default), the present day at
        the time of the call is used.

    Returns
    ----------
    JAC : JobAdClassification
        :class:`JobAdClassification` instance with language, model,
        search_terms and sites set.

    Raises
    ----------
    EnvironmentError
        If classification is not enabled in this instance.
    """
    if not self._classification:
        raise EnvironmentError(
            "Classification not enabled in JobAdCollector")

    # Resolve "today" per call; a default evaluated in the signature would
    # be frozen at the moment the class was defined.
    if date_end is None:
        date_end = datetime.date.today()

    JAC = classification.JobAdClassification(
        self._Rlibpath, self._search_terms, self._sites, language)
    datab = db_controls.JobAdDB(self._db_name)
    # NOTE(review): the trailing 1 is interpreted by
    # JobAdDB.get_classified_ads, which is not visible here - confirm.
    JAC.train_model(
        datab.get_classified_ads(date_start, date_end, language, 1))
    datab.disconnect_db()
    return JAC
def output_classified_results(self,
                              date_start=datetime.datetime.strptime(
                                  "01-01-2015", "%d-%m-%Y"),
                              date_end=None,
                              language="English",
                              output_name="class.csv",
                              output_type="csv"):
    """Outputs classified job ads from the database as an HTML or CSV file.

    All classified job ads of the given language between argument dates
    are included in the output. An ``output_type`` other than "html" or
    "csv" results in no file being written.

    Arguments
    ----------
    date_start : :class:`datetime`
        Earliest date of job ads. Default is start of 2015.
    date_end : :class:`datetime`
        Latest date of job ads. If None (the default), the present day at
        the time of the call is used.
    language : str
        Language of classified jobs ads to output.
    output_name : str
        Name of the file to output results to.
    output_type : str
        Type of output file, "csv" or "html".
    """
    # Resolve "today" per call; a default evaluated in the signature would
    # be frozen at the moment the class was defined.
    if date_end is None:
        date_end = datetime.date.today()

    datab = db_controls.JobAdDB(self._db_name)
    # NOTE(review): the trailing 1 is interpreted by
    # JobAdDB.get_classified_ads, which is not visible here - confirm.
    ads = datab.get_classified_ads(date_start, date_end, language, 1)
    print("Writing to %s from %s." % (output_name, self._db_name))
    if output_type == "html":
        # HTML output must use the HTML writer, as output_results does.
        datab.write_HTML_file(ads, output_name)
    elif output_type == "csv":
        datab.write_CSV_file(ads, output_name)
def classify_ads_GUI(self, date_start, date_end):
    """Open the classification GUI for job ads between the given dates.

    The job ads within the given dates are loaded into a
    :class:`JobAdGUI`. Once the GUI main loop returns, the entries held in
    the GUI's ad storage are converted back into job ads and the database
    is updated with them.

    Arguments
    ----------
    date_start : :class:`datetime`
        Earliest date of job ads. If None, all job ads since the start of
        the database are included.
    date_end : :class:`datetime`
        Latest date of job ads. If None, all job ads after date_start are
        included. If both date_start and date_end are None, all job ads in
        the database are included.
    """
    database = db_controls.JobAdDB(self._db_name)
    window = db_gui.JobAdGUI(database.get_ads(date_start, date_end))
    window.mainloop()

    # Dictionary with ad ids as keys; each value is paired with the GUI's
    # column names to rebuild the ad.
    edited = window.ad_storage
    updated_ads = []
    for ad_id in edited:
        fields = dict(zip(window._db_data_columns, edited[ad_id]))
        updated_ads.append(JobAd.create(fields))

    database.update_ads(updated_ads)
def setUp(self):
    """Create the database handle and the job ad fixtures for each test.

    A :class:`JobAdDB` named ":memory:" is created and connected
    (presumably an in-memory database - confirm against JobAdDB). Two
    sample ads, a "great" and a "bad" job, are then provided in several
    variants, each converted with ``JobAd.create``:

    * ``job_ads`` - the six basic fields only.
    * ``job_ads_stored`` - basic fields plus today's date and None for
      language, relevant and recommendation.
    * ``job_ads_garbage`` - basic fields plus an unknown "falsekey" entry.
    * ``job_ads_classified`` - like stored, but with language "English"
      and relevant set to 1 and 0 respectively.
    * ``job_ads_classified_less`` - only site, searchterm, title,
      description, language and relevant.
    """
    self.filename = ":memory:"
    self.db = db_controls.JobAdDB(self.filename)
    self.db._connect_db()

    # (column name, column type, flag); only "id" carries flag 1 -
    # presumably the primary key marker, confirm against JobAdDB.
    self.db_columns = [
        ("site", "varchar(255)", 0),
        ("searchterm", "varchar(255)", 0),
        ("id", "varchar(255)", 1),
        ("title", "varchar(255)", 0),
        ("url", "varchar(1000)", 0),
        ("description", "varchar(1000)", 0),
        ("date", "date", 0),
        ("language", "varchar(100)", 0),
        ("relevant", "integer", 0),
        ("recommendation", "integer", 0),
    ]

    # Basic fields shared by the variants of the two sample ads.
    great_job = {
        "site": "best job ads site",
        "searchterm": "greatest jobs",
        "id": "xyz412412se",
        "title": "Great Job",
        "url": "http://www.great.zyx",
        "description": "the absolutely best job",
    }
    bad_job = {
        "site": "worst job ads site",
        "searchterm": "worst jobs",
        "id": "dsfewf32",
        "title": "Bad Job",
        "url": "http://www.poor.zyx",
        "description": "the absolutely worst job",
    }
    # Fields kept in the reduced classified variant, in this order.
    reduced_keys = ("site", "searchterm", "title", "description")

    def as_job_ads(records):
        return [JobAd.create(record) for record in records]

    def reduced(record):
        return {key: record[key] for key in reduced_keys}

    self.job_ads = as_job_ads([dict(great_job), dict(bad_job)])

    self.job_ads_stored = as_job_ads([
        dict(great_job, date=datetime.date.today(), language=None,
             relevant=None, recommendation=None),
        dict(bad_job, date=datetime.date.today(), language=None,
             relevant=None, recommendation=None),
    ])

    self.job_ads_garbage = as_job_ads([
        dict(great_job, falsekey="garbage"),
        dict(bad_job, falsekey="garbage"),
    ])

    self.job_ads_classified = as_job_ads([
        dict(great_job, date=datetime.date.today(), language="English",
             relevant=1, recommendation=None),
        dict(bad_job, date=datetime.date.today(), language="English",
             relevant=0, recommendation=None),
    ])

    self.job_ads_classified_less = as_job_ads([
        dict(reduced(great_job), language="English", relevant=1),
        dict(reduced(bad_job), language="English", relevant=0),
    ])