class ProfesiaJobOffersParser(BaseJobOfferParser):

    def __init__(self):
        super().__init__()
        self.logger = Logger(self.__class__.__name__)
        self.weburl = 'https://www.profesia.sk'

    def parse(self, url, verbose=False):

        # Get the parsed HTML content of the input URL
        parsed = self.content(url)

        # Get the job offers
        offers = self.offers(parsed)
        if verbose:
            pprint(offers)

        return offers

    def content(self, url):
        return self.parser.parse(url)

    def offers(self, parsed_html):

        # Validate the input data
        if not parsed_html:
            return None

        # Parse the HTML for job offers
        offers = parsed_html.find_all('li', class_='list-row')
        if not offers:
            self.logger.warning(
                'No <li> tags with job offers found. Returning None')
            return None

        # Parse the job offers
        result = []
        for offer in offers:
            if offer.find('h2'):
                header = offer.find('h2').find('a', href=True, text=True)
                result.append({
                    'url': '/'.join((self.weburl, header['href'])),
                    'txt': header.text,
                    'emp': offer.find('span', class_='employer'),
                    'loc': offer.find('span', class_='job-location')
                })

        return result
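# Usage sketch added for illustration; it is not part of the original module.
# The helper name and the search URL below are assumptions about how a
# profesia.sk results page is addressed.
def _demo_profesia_parser():
    parser = ProfesiaJobOffersParser()
    offers = parser.parse('https://www.profesia.sk/praca/?search_anywhere=python',
                          verbose=True)

    # Each offer is a dict with 'url', 'txt', 'emp' and 'loc' keys
    for offer in offers or []:
        print(offer['txt'], '->', offer['url'])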
class KarieraJobOffersParser(BaseJobOfferParser):

    def __init__(self):
        super().__init__()
        self.logger = Logger(self.__class__.__name__)
        self.weburl = 'https://kariera.sk'

    def parse(self, url, verbose=False):

        # Get the parsed HTML content of the input URL
        parsed = self.content(url)

        # Get the job offers
        offers = self.offers(parsed)
        if verbose:
            pprint(offers)

        return offers

    def content(self, url):
        return self.parser.parse(url)

    def offers(self, parsed_html):

        # Validate the input data
        if not parsed_html:
            return None

        # Parse the HTML for job offers
        offers = parsed_html.find_all('div', class_='column2 offer-list-info')
        if not offers:
            self.logger.warning(
                'No <div> tags with job offers found. Returning None')
            return None

        # Parse the job offers
        result = []
        for offer in offers:
            if offer.find('h2'):
                header = offer.find('h2').find('a', href=True, text=True)
                result.append({
                    'url': header['href'],
                    'txt': header.text,
                    'emp': offer.find('a', class_='employer', href=True, text=True).text,
                    'loc': offer.find('span', class_='place').text
                })

        return result
def __init__(self):
    super().__init__()

    # Prepare the logger and the web URL
    self.logger = Logger(self.__class__.__name__)
    self.weburl = 'https://www.nehnutelnosti.sk'

    # Prepare the offer data extractor
    self.extractor = NehnutelnostiPropertyOffersDataExtractor()
class BaseWebContentParser(object):

    HTML_PARSER_TYPE = ''
    HTML_PARSER_NAME = ''
    BASE_URL_STRING = ''

    REQUEST_TIMEOUT = 10
    REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/61.0.3163.100 '
                      'Safari/537.36'
    }

    def __init__(self):
        super().__init__()
        self.parser = None
        self.logger = Logger(self.__class__.__name__)

    def parse(self, url):
        return self.get_valid_parsed_object(
            self.get_valid_response(self.get_valid_url(url)))

    def get_valid_url(self, url=None):
        if not url:
            return None
        else:
            url = self.get_url(url)
            if not self.is_valid_url(url):
                self.logger.warning(
                    f'Invalid URL {url} encountered. Returning None')
                return None
            return url

    def get_valid_response(self, url=None):
        if not url:
            return None
        else:
            response = self.get_response(url)
            if not self.is_valid_response(response):
                self.logger.warning(
                    f'Status code {response.status_code}. Returning None')
                return None
            return response

    def get_valid_parsed_object(self, response):
        if not response or not self.is_valid_response(response):
            return None
        else:
            parsed_object = self.get_parsed_object(response)
            if not self.is_valid_parsed_object(parsed_object):
                self.logger.warning(
                    'HTML parser was not initialized. Returning None')
                return None
            return parsed_object

    def get_url(self, url):
        return str(url) if url else self.BASE_URL_STRING

    def get_response(self, url):
        return requests.get(url,
                            headers=self.REQUEST_HEADERS,
                            timeout=self.REQUEST_TIMEOUT)

    def get_parsed_object(self, response):
        return self.parser(response.content, self.HTML_PARSER_TYPE) if self.parser else None

    @staticmethod
    def is_valid_url(url):
        return url and (validators.url(url) is True)

    @staticmethod
    def is_valid_response(response):
        return response.status_code == requests.codes.ok

    @staticmethod
    def is_valid_parsed_object(parsed_object):
        return parsed_object if parsed_object else False
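# Subclass sketch (illustrative only, not part of the original code base): the
# base class is meant to be specialized by overriding the class-level constants
# and assigning a parser callable that accepts (content, parser_type), such as
# bs4.BeautifulSoup. The class name and BASE_URL_STRING below are assumptions.
class ExampleWebContentParser(BaseWebContentParser):

    HTML_PARSER_TYPE = 'html.parser'
    HTML_PARSER_NAME = 'example'
    BASE_URL_STRING = 'https://example.com'

    def __init__(self):
        super().__init__()

        # BeautifulSoup(content, 'html.parser') matches the call made
        # in BaseWebContentParser.get_parsed_object()
        from bs4 import BeautifulSoup
        self.parser = BeautifulSoup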
def __init__(self):
    super().__init__()
    self.parser = None
    self.logger = Logger(self.__class__.__name__)
def __init__(self):
    super().__init__()
    self.parser = BeautifulSoup
    self.logger = Logger(self.__class__.__name__)
def cross_validate_classifier(X,
                              y,
                              model,
                              threshold=0.5,
                              metrics=("mcc", "acc", "sen", "spe"),
                              num_folds=10,
                              num_repetitions=20,
                              seed=42,
                              logger=None):
    """
    Cross-validate the binary classification model

    This function cross-validates the input classification model using X, y.
    The cross-validation is set by num_folds and num_repetitions. After the
    model is cross-validated, several metrics are computed.

    Parameters
    ----------

    X : numpy array
        2D feature matrix (rows=observations, cols=features)

    y : numpy array
        1D labels array

    model : class that implements fit, and predict methods
        Initialized binary classification model

    threshold : float, optional, default 0.5
        Threshold for encoding the predicted probability as a class label

    metrics : tuple, optional, default ("mcc", "acc", "sen", "spe")
        Tuple with classification metrics to compute

    num_folds : int, optional, default 10
        Number of cross-validation folds

    num_repetitions : int, optional, default 20
        Number of cross-validation runs

    seed : int, optional, default 42
        Random generator seed

    logger : Logger, optional, default None
        Logger class

    Returns
    -------

    Default dictionary with keys=metric names, vals=metric arrays

    Raises
    ------

    TypeError
        Raised when X or y is not an instance of np.ndarray

    ValueError
        Raised when X and y do not have the same number of rows (observations)
    """

    # Prepare the logger
    logger = logger if logger else Logger(inspect.currentframe().f_code.co_name)

    # Prepare the results table for the cross-validation results
    table_cv_data = defaultdict(list)

    # Run the desired number of cross-validation repetitions
    for repetition in range(num_repetitions):
        for train_i, test_i in StratifiedKFold(n_splits=num_folds,
                                               random_state=seed,
                                               shuffle=True).split(X, y):

            # Split the data to train, test sets
            X_train, X_test = X[train_i], X[test_i]
            y_train, y_test = y[train_i], y[test_i]

            try:

                # Fit the classifier
                model.fit(X_train, y_train)

                # Evaluate the classifier
                predicted = model.predict(X_test)

                # Encode the labels
                y_true = np.array(y_test, dtype=np.int16)
                y_pred = np.array([0 if y_hat < threshold else 1 for y_hat in predicted],
                                  dtype=np.int16)

                # Compute the classification metrics
                for metric in metrics:
                    computed = classification_metric(metric, y_true, y_pred)
                    computed = computed if computed and np.isfinite(computed) else None
                    if computed:
                        table_cv_data[metric].append(computed)

            except Exception as e:
                if "Input contains NaN, infinity or a value too large" in str(e):
                    logger.warning("Poor performance detected, skipping current validation fold")
                    continue
                else:
                    logger.exception(e)

    return table_cv_data
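# Usage sketch (illustrative only): cross-validate a scikit-learn classifier.
# The helper name, the synthetic data, and the LogisticRegression choice are
# assumptions; any model exposing fit/predict works. classification_metric()
# is assumed to be importable alongside cross_validate_classifier.
def _demo_cross_validate_classifier():
    from sklearn.linear_model import LogisticRegression

    # Synthetic binary classification problem
    rng = np.random.RandomState(42)
    X = rng.randn(100, 5)
    y = (X[:, 0] + 0.1 * rng.randn(100) > 0).astype(np.int16)

    # Cross-validate and report mean +- std per metric
    results = cross_validate_classifier(X, y, LogisticRegression(), num_repetitions=5)
    for metric, values in results.items():
        if values:
            print(f'{metric}: {np.mean(values):.3f} +/- {np.std(values):.3f}')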
def __init__(self):
    super().__init__()
    self.logger = Logger(self.__class__.__name__)
    self.weburl = 'https://kariera.sk'
def __init__(self):
    super().__init__()
    self.logger = Logger(self.__class__.__name__)
    self.weburl = 'https://www.profesia.sk'
def cross_validate_regressor(X,
                             y,
                             model,
                             metrics=("mae", "mse", "rmse", "eer"),
                             num_folds=10,
                             num_repetitions=20,
                             seed=42,
                             logger=None):
    """
    Cross-validate the regression model

    This function cross-validates the input regression model using X, y.
    The cross-validation is set by num_folds and num_repetitions. After the
    model is cross-validated, several metrics are computed.

    Parameters
    ----------

    X : numpy array
        2D feature matrix (rows=observations, cols=features)

    y : numpy array
        1D labels array

    model : class that implements fit, and predict methods
        Initialized regression model

    metrics : tuple, optional, default ("mae", "mse", "rmse", "eer")
        Tuple with regression metrics to compute

    num_folds : int, optional, default 10
        Number of cross-validation folds

    num_repetitions : int, optional, default 20
        Number of cross-validation runs

    seed : int, optional, default 42
        Random generator seed

    logger : Logger, optional, default None
        Logger class

    Returns
    -------

    Default dictionary with keys=metric names, vals=metric arrays

    Raises
    ------

    TypeError
        Raised when X or y is not an instance of np.ndarray

    ValueError
        Raised when X and y do not have the same number of rows (observations)
    """

    # Prepare the logger
    logger = logger if logger else Logger(inspect.currentframe().f_code.co_name)

    # Prepare the results table for the cross-validation results
    table_cv_data = defaultdict(list)

    # Run the desired number of cross-validation repetitions
    for repetition in range(num_repetitions):

        # Prepare the accumulator for predictions
        y_pred_accumulator = np.zeros(y.shape)

        # Permute the data prior to processing
        p = np.random.permutation(X.shape[0])
        X = X[p]
        y = y[p]

        for train_i, test_i in KFold(n_splits=num_folds,
                                     random_state=seed,
                                     shuffle=True).split(X):

            # Split the data to train, test sets
            X_train, X_test = X[train_i], X[test_i]
            y_train, y_test = y[train_i], y[test_i]

            try:

                # Fit the regressor
                model.fit(X_train, y_train)

                # Evaluate the regressor
                y_pred_accumulator[test_i] = model.predict(X_test)

            except Exception as e:
                if "Input contains NaN, infinity or a value too large" in str(e):
                    logger.warning("Poor performance detected, skipping current validation fold")
                    continue
                else:
                    logger.exception(e)

        # Encode the labels
        y_true = np.array(y, dtype=np.int16)
        y_pred = np.array(y_pred_accumulator, dtype=np.int16)

        # Compute the regression metrics
        for metric in metrics:
            computed = regression_metric(metric, y_true, y_pred, 1 if X.ndim == 1 else X.shape[1])
            computed = computed if computed and np.isfinite(computed) else None
            if computed:
                table_cv_data[metric].append(computed)

    return table_cv_data
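# Usage sketch (illustrative only): cross-validate a scikit-learn regressor on
# synthetic integer-valued targets (the function casts y to int16 before
# scoring). The helper name, data, and Ridge model are assumptions; any model
# with fit/predict works, and regression_metric() is assumed to be importable
# alongside cross_validate_regressor.
def _demo_cross_validate_regressor():
    from sklearn.linear_model import Ridge

    # Synthetic regression problem with roughly integer targets
    rng = np.random.RandomState(42)
    X = rng.randn(120, 4)
    y = np.round(3 * X[:, 0] - 2 * X[:, 1] + rng.randn(120)).astype(np.int16)

    # Cross-validate and report mean +- std per metric
    results = cross_validate_regressor(X, y, Ridge(), num_repetitions=5)
    for metric, values in results.items():
        if values:
            print(f'{metric}: {np.mean(values):.3f} +/- {np.std(values):.3f}')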
def search_over_observations(X,
                             y,
                             model,
                             to_remove=(0, 1, 2),
                             to_score="mcc",
                             threshold=0.5,
                             metrics=("mcc", "acc", "sen", "spe"),
                             num_folds=10,
                             num_repetitions=20,
                             seed=42,
                             logger=None,
                             verbose=False):
    """
    Search for the best subset of observations

    This function searches for the sub-sample of observations that yields the
    best classification performance by iteratively sub-sampling combinations
    from the input feature matrix <X> according to <to_remove>. If, for
    instance, it is set to (0, 1, 2, ...), the function:
      - uses all observations
      - uses all combinations of X.shape[0] - 1 observations
      - uses all combinations of X.shape[0] - 2 observations
      - etc.

    It returns a list of dicts. Each dict holds the exact indices used to
    sub-sample the observations and the performance measures that quantify
    how well the classification model performed on that particular sub-sample.

    Parameters
    ----------

    X : numpy array
        2D feature matrix (rows=observations, cols=features)

    y : numpy array
        1D labels array

    model : class that implements fit, and predict methods
        Initialized binary classification model

    to_remove : list or tuple, optional, default (0, 1, 2)
        Iterable with the number of observations to remove from sub-samples

    to_score : str, optional, default "mcc"
        Scoring function used to score each sub-sample

    threshold : float, optional, default 0.5
        Threshold for encoding the predicted probability as a class label

    metrics : tuple, optional, default ("mcc", "acc", "sen", "spe")
        Tuple with classification metrics to compute

    num_folds : int, optional, default 10
        Number of cross-validation folds

    num_repetitions : int, optional, default 20
        Number of cross-validation runs

    seed : int, optional, default 42
        Random generator seed

    logger : Logger, optional, default None
        Logger class

    verbose : bool, optional, default False
        Verbosity switch (informing about the performance on each sub-sample)

    Returns
    -------

    List of dicts with the performance on each observation sub-sample

    Raises
    ------

    TypeError
        Raised when X or y is not an instance of np.ndarray

    ValueError
        Raised when X and y do not have the same number of rows (observations)
    """

    # Prepare the logger
    logger = logger if logger else Logger(inspect.currentframe().f_code.co_name)

    # Make sure the scoring function is in the metrics
    if to_score not in metrics:
        metrics = [m for m in metrics] + [to_score]

    # Prepare the list of results
    results = []

    # Cross-validate the classifier on the sub-samples of observations
    for i in to_remove:
        for selection in combinations(range(X.shape[0]), X.shape[0] - i):

            # Convert to a list (necessary for array slicing)
            selection = list(selection)

            # Select the subset of observations
            X_try = X[selection]
            y_try = y[selection]

            # Cross-validate the classifier
            cv = cross_validate_classifier(X_try,
                                           y_try,
                                           model,
                                           threshold=threshold,
                                           metrics=metrics,
                                           num_folds=num_folds,
                                           num_repetitions=num_repetitions,
                                           seed=seed,
                                           logger=logger)

            # Compute the scores (mean +- std of all cv metrics)
            scores = [(m, np.mean(cv.get(m)), np.std(cv.get(m))) for m in metrics]
            result = round(float(np.mean(cv.get(to_score))), 4)

            # Append to the results
            results.append({"score": result,
                            "metrics": scores,
                            "selection": selection,
                            "removed": i})

            # Log the performance
            if verbose:
                logger.info("Skipping {} observation(s), {} = {}".format(i, to_score, result))

    return results
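# Usage sketch (illustrative only): search for the observation subset on which
# a classifier cross-validates best. The helper name, data, and model are
# assumptions; removing at most one observation keeps the number of evaluated
# combinations small for the demo.
def _demo_search_over_observations():
    from sklearn.linear_model import LogisticRegression

    # Small synthetic binary classification problem
    rng = np.random.RandomState(42)
    X = rng.randn(30, 3)
    y = (X[:, 0] > 0).astype(np.int16)

    results = search_over_observations(X, y, LogisticRegression(),
                                       to_remove=(0, 1),
                                       num_repetitions=2,
                                       verbose=True)

    # Pick the sub-sample with the highest score (mean of to_score metric)
    best = max(results, key=lambda r: r["score"])
    print('Best score:', best["score"], 'using observations', best["selection"])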
def __init__(self):
    self.parser = BeautifulSoupWebContentParser()
    self.logger = Logger(self.__class__.__name__)