Ejemplo n.º 1
0
    def start_search(self, search_term=None):
        """Searches the supported sites for job ads and stores the results.

        For every search term, one parser per site (Monster, Indeed,
        Duunitori and Oikotie) is created and all of them are run together
        on the asyncio event loop. The job ads gathered by the parsers are
        then stored in the database. After each search term the method
        sleeps for a random 0 to 1 seconds to avoid bombarding the sites.

        Arguments
        ----------
        search_term : str or list[str]
            Search term(s) to use. A single string is treated as one search
            term; any other iterable is treated as a collection of search
            terms. If None (or empty), instance variable search_terms, set
            during initialization, is used.
        """
        # NOTE: seeding with a constant makes the sequence of delays the same
        # on every call.
        random.seed(1222)
        datab = db_controls.JobAdDB(self._db_name)

        if not search_term:
            searchables = self._search_terms
        elif isinstance(search_term, str):
            searchables = [search_term]
        else:
            # A list of terms must be searched term by term, not wrapped
            # into a single-element list and handed to the parsers whole.
            searchables = list(search_term)

        # A separate loop variable keeps the search_term argument intact.
        for term in searchables:
            print("Searching for \"%s\"." % term)

            # Initialize parsers
            ps = [parsers.MonsterParser(),
                  parsers.IndeedParser(),
                  parsers.DuunitoriParser(),
                  parsers.OikotieParser()]

            # Search for job ads on all sites at once
            loop = asyncio.get_event_loop()
            tasks = [parser.parse(term) for parser in ps]
            loop.run_until_complete(asyncio.gather(*tasks))

            # Save job ads in database
            for parser in ps:
                datab.store_ads(parser.get_job_ads())

            # Sleep for random time to avoid bombarding sites
            time.sleep(random.uniform(0, 1))

        # NOTE(review): unlike the sibling methods, the database is not
        # explicitly disconnected here -- confirm whether store_ads closes
        # the connection itself.
Ejemplo n.º 2
0
    def output_results(self, date_start, date_end, output_name, output_type):
        """Writes job ads stored in the database to an HTML or CSV file.

        The job ads returned by the database for the given date range are
        written to the output file. An output type other than "html" or
        "csv" results in no file being written.

        Arguments
        ----------
        date_start : :class:`datetime`
            Earliest date of job ads. If None, all job ads since the start
            of the database are output.
        date_end : :class:`datetime`
            Latest date of job ads. If None, all job ads until end of database
            are output. If both date_start and date_end are None, all job ads
            in the database are output.
        output_name : str
            Name of the file to output results to.
        output_type : str
            Type of output file, "csv" or "html" possible.
        """
        database = db_controls.JobAdDB(self._db_name)
        print("Writing to %s from %s." % (output_name, self._db_name))

        # Unsupported output types are ignored without touching the database.
        if output_type not in ("html", "csv"):
            return

        if output_type == "html":
            write_file = database.write_HTML_file
        else:
            write_file = database.write_CSV_file
        write_file(database.get_ads(date_start, date_end), output_name)
Ejemplo n.º 3
0
    def recomm_store_ads(self, JAC, language, date_start, date_end):
        """Stores model recommendations for job ads in the database.

        The job ads of the given language between the given dates are
        fetched from the database and passed to the provided classification
        instance. The recommendations it returns are written back to the
        database, after which the database is disconnected.

        Arguments
        ----------
        JAC : :class:`JobAdClassification`
            Fully set up :class:`JobAdClassification` instance.
        language : str
            Language of model and job ads to provide recommendations for.
        date_start : :class:`datetime`
            Earliest date of job ads. If None, all job ads since the start of
            the database are included.
        date_end : :class:`datetime`
            Latest date of job ads. If None, all job ads after date_start are
            included. If both date_start and date_end are None, all job ads in
            the database are included.

        Raises
        ----------
        EnvironmentError
            If classification is not enabled in this instance.
        """
        classification_enabled = bool(self._classification)
        if classification_enabled is False:
            raise EnvironmentError("Classification not enabled in JobAdCollector.")

        database = db_controls.JobAdDB(self._db_name)
        candidates = database.get_ads(date_start, date_end, language)
        database.update_ads_recommendation(JAC.recommend_ads(candidates))
        database.disconnect_db()
Ejemplo n.º 4
0
    def det_lang_store_ads(self, date_start, date_end):
        """Determines the language of job ads and stores it in the database.

        The job ads between the given dates are fetched from the database
        and passed to a :class:`JobAdClassification` instance for language
        detection. The results are written back to the database, after
        which the database is disconnected. Classification has to be
        enabled in the JobAdCollector instance.

        Arguments
        ----------
        date_start : :class:`datetime`
            Earliest date of job ads. If None, all job ads since the start of
            the database are included.
        date_end : :class:`datetime`
            Latest date of job ads. If None, all job ads after date_start are
            included. If both date_start and date_end are None, all job ads in
            the database are included.

        Raises
        ----------
        EnvironmentError
            If classification is not enabled in this instance.
        """
        classification_enabled = bool(self._classification)
        if classification_enabled is False:
            raise EnvironmentError("Classification not enabled in JobAdCollector.")

        database = db_controls.JobAdDB(self._db_name)
        # Search terms, sites and language are left empty for this instance.
        detector = classification.JobAdClassification(self._Rlibpath, [], [], "")

        stored_ads = database.get_ads(date_start, date_end)
        database.update_ads_language(detector.det_lang_ads(stored_ads))
        database.disconnect_db()
Ejemplo n.º 5
0
    def train_model(self, language, date_start=datetime.datetime.strptime("01-01-2015", "%d-%m-%Y"),
                    date_end=None):
        """Trains random forest model on classified job ads.

        All classified job ads of the given language between argument dates
        are included in the training. The database is disconnected once the
        training has finished.

        Arguments
        ----------
        language : str
            Language of job ads to train on. Needed for proper stemming and
            removal of stopwords.
        date_start : :class:`datetime`
            Earliest date of job ads. Default is start of 2015.
        date_end : :class:`datetime`
            Latest date of job ads. If None (the default), the present day
            at the time of the call is used.

        Returns
        ----------
        JAC : JobAdClassification
            :class:`JobAdClassification` instance with language, model,
            search_terms and sites set.

        Raises
        ----------
        EnvironmentError
            If classification is not enabled in this instance.
        """
        if not self._classification:
            raise EnvironmentError("Classification not enabled in JobAdCollector")

        # A default of datetime.date.today() in the signature would be
        # evaluated only once, when the method is defined, and go stale in a
        # long-running process. Resolve "present day" at call time instead.
        if date_end is None:
            date_end = datetime.date.today()

        JAC = classification.JobAdClassification(self._Rlibpath, self._search_terms,
                                                 self._sites, language)
        datab = db_controls.JobAdDB(self._db_name)
        # The return value of JAC.train_model is not needed here; the
        # instance itself is handed back to the caller.
        JAC.train_model(
            datab.get_classified_ads(date_start, date_end, language, 1))

        datab.disconnect_db()

        return JAC
Ejemplo n.º 6
0
    def output_classified_results(self,
                                  date_start=datetime.datetime.strptime(
                                        "01-01-2015", "%d-%m-%Y"),
                                  date_end=None, language="English",
                                  output_name="class.csv", output_type="csv"):
        """Outputs classified job ads from the database as an HTML or CSV file.

        All classified job ads of the given language between argument dates
        are included in the output. An output type other than "html" or
        "csv" results in no file being written.

        Arguments
        ----------
        date_start : :class:`datetime`
            Earliest date of job ads. Default is start of 2015.
        date_end : :class:`datetime`
            Latest date of job ads. If None (the default), the present day
            at the time of the call is used.
        language : str
            Language of classified jobs ads to output.
        output_name : str
            Name of the file to output results to.
        output_type : str
            Type of output file, "csv" or "html".
        """
        # A default of datetime.date.today() in the signature would be
        # evaluated only once, when the method is defined, and go stale in a
        # long-running process. Resolve "present day" at call time instead.
        if date_end is None:
            date_end = datetime.date.today()

        datab = db_controls.JobAdDB(self._db_name)
        ads = datab.get_classified_ads(date_start, date_end, language, 1)
        print("Writing to %s from %s." % (output_name, self._db_name))
        if output_type == "html":
            # HTML output must use the HTML writer, not the CSV writer.
            datab.write_HTML_file(ads, output_name)
        elif output_type == "csv":
            datab.write_CSV_file(ads, output_name)
Ejemplo n.º 7
0
    def classify_ads_GUI(self, date_start, date_end):
        """Opens the classification GUI for job ads between the given dates.

        The job ads between argument dates are fetched from the database and
        handed to the GUI. Once the GUI main loop has returned, the contents
        of the GUI's ad storage are converted to job ads and written back to
        the database.

        Arguments
        ----------
        date_start : :class:`datetime`
            Earliest date of job ads. If None, all job ads since the start of
            the database are included.
        date_end : :class:`datetime`
            Latest date of job ads. If None, all job ads after date_start are
            included. If both date_start and date_end are None, all job ads in
            the database are included.
        """
        database = db_controls.JobAdDB(self._db_name)

        window = db_gui.JobAdGUI(database.get_ads(date_start, date_end))
        window.mainloop()

        # The ad storage is a dictionary with ids as keys; every value is
        # paired with the GUI's column names to rebuild a job ad.
        edited = window.ad_storage
        updated_ads = []
        for ad_id in edited:
            fields = dict(zip(window._db_data_columns, edited[ad_id]))
            updated_ads.append(JobAd.create(fields))

        database.update_ads(updated_ads)
Ejemplo n.º 8
0
 def setUp(self):
     """Creates an in-memory database and the job ad fixtures for the tests.

     Two base job ads ("Great Job" and "Bad Job") are the source of every
     fixture list:

     * job_ads -- the plain ads.
     * job_ads_stored -- the ads with today's date and empty language,
       relevant and recommendation fields.
     * job_ads_garbage -- the ads with an additional "falsekey" entry.
     * job_ads_classified -- the ads with today's date, language "English"
       and relevant set to 1 and 0 respectively.
     * job_ads_classified_less -- the classified ads reduced to site,
       searchterm, title, description, language and relevant.
     """
     self.filename = ":memory:"
     self.db = db_controls.JobAdDB(self.filename)
     self.db._connect_db()
     # (column name, column type, flag) triples; only "id" carries flag 1.
     self.db_columns = [
         ("site", "varchar(255)", 0),
         ("searchterm", "varchar(255)", 0),
         ("id", "varchar(255)", 1),
         ("title", "varchar(255)", 0),
         ("url", "varchar(1000)", 0),
         ("description", "varchar(1000)", 0),
         ("date", "date", 0),
         ("language", "varchar(100)", 0),
         ("relevant", "integer", 0),
         ("recommendation", "integer", 0),
     ]

     today = datetime.date.today()
     great_job = {
         "site": "best job ads site",
         "searchterm": "greatest jobs",
         "id": "xyz412412se",
         "title": "Great Job",
         "url": "http://www.great.zyx",
         "description": "the absolutely best job",
     }
     bad_job = {
         "site": "worst job ads site",
         "searchterm": "worst jobs",
         "id": "dsfewf32",
         "title": "Bad Job",
         "url": "http://www.poor.zyx",
         "description": "the absolutely worst job",
     }
     base_ads = (great_job, bad_job)

     def ads_with(*extras):
         # Build each job ad from a fresh copy of its base ad, with the
         # extra fields appended after the base fields.
         return [JobAd.create(dict(base, **extra))
                 for base, extra in zip(base_ads, extras)]

     self.job_ads = ads_with({}, {})

     unclassified = {"date": today, "language": None,
                     "relevant": None, "recommendation": None}
     self.job_ads_stored = ads_with(unclassified, unclassified)

     garbage = {"falsekey": "garbage"}
     self.job_ads_garbage = ads_with(garbage, garbage)

     self.job_ads_classified = ads_with(
         {"date": today, "language": "English",
          "relevant": 1, "recommendation": None},
         {"date": today, "language": "English",
          "relevant": 0, "recommendation": None},
     )

     # The reduced fixture leaves out id, url, date and recommendation.
     reduced_keys = ("site", "searchterm", "title", "description")
     self.job_ads_classified_less = [
         JobAd.create(dict({key: base[key] for key in reduced_keys},
                           language="English", relevant=relevant))
         for base, relevant in zip(base_ads, (1, 0))
     ]