Ejemplo n.º 1
0
class ProfesiaJobOffersParser(BaseJobOfferParser):
    """Parser of the job offers listed on https://www.profesia.sk"""

    def __init__(self):
        super().__init__()

        self.logger = Logger(self.__class__.__name__)
        self.weburl = 'https://www.profesia.sk'

    def parse(self, url, verbose=False):
        """
        Download the page at <url> and extract the job offers from it

        Parameters
        ----------

        url : str
            Address of the page with the list of job offers

        verbose : bool, optional, default False
            Verbosity switch (pretty-prints the extracted job offers)

        Returns
        -------

        List of dicts with the job offers (see <offers>), or None
        """

        # Get the parsed HTML content of the input URL
        parsed = self.content(url)

        # Get the job offers
        offers = self.offers(parsed)

        if verbose:
            pprint(offers)
        return offers

    def content(self, url):
        """Return the content of <url> as parsed by self.parser"""
        # NOTE(review): self.parser is not set in this class; presumably it
        # is prepared by the base class constructor -- confirm.
        return self.parser.parse(url)

    def offers(self, parsed_html):
        """
        Extract the job offers from the parsed HTML

        Parameters
        ----------

        parsed_html : parsed HTML object exposing find_all
            Parsed page (presumably a BeautifulSoup object -- confirm)

        Returns
        -------

        None if the input is empty or no <li class="list-row"> tag is found.
        Otherwise a list of dicts with the keys:
         - 'url': absolute address of the offer
         - 'txt': text of the offer's link
         - 'emp': result of find() for the employer <span> (not its text)
         - 'loc': result of find() for the job-location <span> (not its text)
        """
        from urllib.parse import urljoin

        # Validate the input data
        if not parsed_html:
            return None

        # Parse the HTML for job offers
        offers = parsed_html.find_all('li', class_='list-row')
        if not offers:
            self.logger.warning(
                'No <li> tags with job offers found. Returning None')
            return None

        # Base for resolving the links; the trailing slash makes urljoin treat
        # relative links as children of the web URL.
        base_url = self.weburl.rstrip('/') + '/'

        # Parse the job offers
        result = []
        for offer in offers:
            heading = offer.find('h2')
            if heading is None:
                continue

            # Skip the rows whose heading holds no usable link instead of
            # failing on header['href'] for the whole page.
            header = heading.find('a', href=True, text=True)
            if header is None:
                continue

            result.append({
                # urljoin avoids the double slash that '/'.join produced for
                # root-relative links and keeps absolute links intact.
                'url': urljoin(base_url, header['href']),
                'txt': header.text,
                'emp': offer.find('span', class_='employer'),
                'loc': offer.find('span', class_='job-location')
            })

        return result
Ejemplo n.º 2
0
class KarieraJobOffersParser(BaseJobOfferParser):
    """Parser of the job offers listed on https://kariera.sk"""

    def __init__(self):
        super().__init__()

        self.logger = Logger(self.__class__.__name__)
        self.weburl = 'https://kariera.sk'

    def parse(self, url, verbose=False):
        """
        Download the page at <url> and extract the job offers from it

        Parameters
        ----------

        url : str
            Address of the page with the list of job offers

        verbose : bool, optional, default False
            Verbosity switch (pretty-prints the extracted job offers)

        Returns
        -------

        List of dicts with the job offers (see <offers>), or None
        """

        # Get the parsed HTML content of the input URL
        parsed = self.content(url)

        # Get the job offers
        offers = self.offers(parsed)

        if verbose:
            pprint(offers)
        return offers

    def content(self, url):
        """Return the content of <url> as parsed by self.parser"""
        # NOTE(review): self.parser is not set in this class; presumably it
        # is prepared by the base class constructor -- confirm.
        return self.parser.parse(url)

    def offers(self, parsed_html):
        """
        Extract the job offers from the parsed HTML

        Parameters
        ----------

        parsed_html : parsed HTML object exposing find_all
            Parsed page (presumably a BeautifulSoup object -- confirm)

        Returns
        -------

        None if the input is empty or no offer <div> tag is found. Otherwise
        a list of dicts with the keys:
         - 'url': value of the href attribute of the offer's link
         - 'txt': text of the offer's link
         - 'emp': text of the employer link (None if the link is missing)
         - 'loc': text of the place <span> (None if the span is missing)
        """

        # Validate the input data
        if not parsed_html:
            return None

        # Parse the HTML for job offers
        offers = parsed_html.find_all('div', class_='column2 offer-list-info')
        if not offers:
            self.logger.warning('No <div> tags with job offers found. Returning None')
            return None

        # Parse the job offers
        result = []
        for offer in offers:
            heading = offer.find('h2')
            if heading is None:
                continue

            # Skip the offers whose heading holds no usable link instead of
            # failing on header['href'] for the whole page.
            header = heading.find('a', href=True, text=True)
            if header is None:
                continue

            # The employer and the place are optional: one offer without them
            # must not abort the parsing of all the other offers.
            employer = offer.find('a', class_='employer', href=True, text=True)
            place = offer.find('span', class_='place')

            result.append({
                'url': header['href'],
                'txt': header.text,
                'emp': employer.text if employer is not None else None,
                'loc': place.text if place is not None else None
            })

        return result
    def __init__(self):
        """Set up the logger, the web URL and the offer data extractor"""
        super().__init__()

        # The logger carries the name of the concrete class
        class_name = self.__class__.__name__
        self.logger = Logger(class_name)

        # Web URL of the parsed portal
        self.weburl = 'https://www.nehnutelnosti.sk'

        # Extractor of the data from the offers
        self.extractor = NehnutelnostiPropertyOffersDataExtractor()
Ejemplo n.º 4
0
class BaseWebContentParser(object):
    """
    Base class for downloading a web page and parsing its content

    <parse> chains three steps: validation of the URL, HTTP GET request, and
    parsing of the response body. A step that fails logs a warning and yields
    None, which the following steps pass through, so <parse> returns either
    the parsed object or None.

    The parsing itself is delegated to <self.parser>, which is called with
    (response.content, HTML_PARSER_TYPE). It is None in this base class;
    presumably sub-classes set it together with the HTML_PARSER_* constants.
    """

    HTML_PARSER_TYPE = ''
    HTML_PARSER_NAME = ''

    BASE_URL_STRING = ''
    REQUEST_TIMEOUT = 10
    REQUEST_HEADERS = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset':
        'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding':
        'none',
        'Accept-Language':
        'en-US,en;q=0.8',
        'Connection':
        'keep-alive',
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/61.0.3163.100 '
        'Safari/537.36'
    }

    def __init__(self):
        super().__init__()

        self.parser = None
        self.logger = Logger(self.__class__.__name__)

    def parse(self, url):
        """Return the parsed content of <url>, or None if any step fails"""
        return self.get_valid_parsed_object(
            self.get_valid_response(self.get_valid_url(url)))

    def get_valid_url(self, url=None):
        """Return <url> as a string if it is a valid URL, otherwise None"""
        if not url:
            return None

        url = self.get_url(url)
        if not self.is_valid_url(url):
            self.logger.warning(
                f'Invalid URL {url} encountered. Returning None')
            return None
        return url

    def get_valid_response(self, url=None):
        """
        Return the response to the GET request to <url>, or None

        None is returned if <url> is empty, if the request itself fails
        (connection error, timeout, ...), or if the status code is not OK.
        """
        if not url:
            return None

        # A failed request is handled like every other failure in this class
        # (warning + None) rather than escaping as an exception.
        try:
            response = self.get_response(url)
        except requests.RequestException as e:
            self.logger.warning(
                f'Request to {url} failed ({e}). Returning None')
            return None

        if not self.is_valid_response(response):
            status_code = getattr(response, 'status_code', None)
            self.logger.warning(
                f'Status code {status_code}. Returning None')
            return None
        return response

    def get_valid_parsed_object(self, response):
        """Return the parsed body of a valid <response>, otherwise None"""
        if not response or not self.is_valid_response(response):
            return None

        parsed_object = self.get_parsed_object(response)
        if not self.is_valid_parsed_object(parsed_object):
            self.logger.warning(
                'HTML parser was not initialized. Returning None')
            return None
        return parsed_object

    def get_url(self, url):
        """Return <url> as a string (BASE_URL_STRING if <url> is empty)"""
        return str(url) if url else self.BASE_URL_STRING

    def get_response(self, url):
        """Send the GET request to <url> with the class headers and timeout"""
        return requests.get(url,
                            headers=self.REQUEST_HEADERS,
                            timeout=self.REQUEST_TIMEOUT)

    def get_parsed_object(self, response):
        """Parse response.content with self.parser (None if it is not set)"""
        return self.parser(response.content,
                           self.HTML_PARSER_TYPE) if self.parser else None

    @staticmethod
    def is_valid_url(url):
        """Return True if <url> is non-empty and passes validators.url"""
        # bool() keeps the return type boolean for empty inputs as well
        return bool(url) and (validators.url(url) is True)

    @staticmethod
    def is_valid_response(response):
        """Return True if <response> exists and its status code is OK"""
        # A missing response is invalid, not an AttributeError
        return (response is not None
                and response.status_code == requests.codes.ok)

    @staticmethod
    def is_valid_parsed_object(parsed_object):
        """Return <parsed_object> if it is truthy, otherwise False"""
        return parsed_object if parsed_object else False
Ejemplo n.º 5
0
    def __init__(self):
        """Set up the (not yet initialized) parser and the logger"""
        super().__init__()

        # No parser is available until one is assigned
        self.parser = None

        # The logger carries the name of the concrete class
        class_name = self.__class__.__name__
        self.logger = Logger(class_name)
Ejemplo n.º 6
0
    def __init__(self):
        """Set up BeautifulSoup as the parser and prepare the logger"""
        super().__init__()

        # The parser is stored as the class itself (it is not instantiated here)
        self.parser = BeautifulSoup

        # The logger carries the name of the concrete class
        class_name = self.__class__.__name__
        self.logger = Logger(class_name)
Ejemplo n.º 7
0
def cross_validate_classifier(X,
                              y,
                              model,
                              threshold=0.5,
                              metrics=("mcc", "acc", "sen", "spe"),
                              num_folds=10,
                              num_repetitions=20,
                              seed=42,
                              logger=None):
    """
    Cross-validate the binary classification model

    This function cross-validates the input classification model using X, y. The
    cross-validation is set by num_folds and num_repetitions. After the model is
    cross-validated, several metrics are computed.

    Each repetition shuffles the data with its own seed (seed + repetition), so
    the repetitions use different folds while the whole run stays reproducible.

    Parameters
    ----------

    X : numpy array
        2D feature matrix (rows=observations, cols=features)

    y : numpy array
        1D labels array

    model : class that implements fit, and predict methods
        Initialized binary classification model

    threshold : float, optional, default 0.5
        Threshold for encoding the predicted probability as a class label
        (predictions below the threshold become 0, all the others 1)

    metrics : tuple, optional, default ("mcc", "acc", "sen", "spe")
        Tuple with classification metrics to compute

    num_folds : int, optional, default 10
        Number of cross-validation folds

    num_repetitions : int, optional, default 20
        Number of cross-validation runs

    seed : int, optional, default 42
        Random generator seed

    logger : Logger, optional, default None
        Logger class

    Returns
    -------

    Default dictionary with keys=metric names, vals=lists with one value per
    successfully evaluated fold (non-finite and None values are left out)

    Raises
    ------

    Nothing is raised explicitly. Errors raised while creating the folds or
    indexing X, y propagate to the caller. Errors raised while fitting,
    predicting or scoring a fold are logged and the processing continues with
    the next fold.
    NOTE(review): the former docstring listed TypeError/ValueError for invalid
    X, y; no such validation is visible here -- confirm where it is done.
    """

    # Prepare the logger
    logger = logger if logger else Logger(inspect.currentframe().f_code.co_name)

    # Prepare the results table for the cross-validation results
    table_cv_data = defaultdict(list)

    # Run the desired number of cross-validation repetitions
    for repetition in range(num_repetitions):

        # Re-using the very same seed would yield identical folds in every
        # repetition; derive a distinct, reproducible seed for each of them.
        if isinstance(seed, (int, np.integer)):
            fold_seed = seed + repetition
        else:
            fold_seed = seed

        folds = StratifiedKFold(n_splits=num_folds,
                                random_state=fold_seed,
                                shuffle=True)

        for train_i, test_i in folds.split(X, y):

            # Split the data to train, test sets
            X_train, X_test = X[train_i], X[test_i]
            y_train, y_test = y[train_i], y[test_i]

            try:

                # fit the classifier
                model.fit(X_train, y_train)

                # Evaluate the classifier
                predicted = model.predict(X_test)

                # Encode the labels
                y_true = np.array(y_test, dtype=np.int16)
                y_pred = np.array([0 if y_hat < threshold else 1 for y_hat in predicted], dtype=np.int16)

                # Compute the classification metrics
                for metric in metrics:
                    computed = classification_metric(metric, y_true, y_pred)

                    # Zero is a legitimate metric value (e.g. mcc = 0): only
                    # missing and non-finite values are left out.
                    if computed is not None and np.isfinite(computed):
                        table_cv_data[metric].append(computed)

            except Exception as e:
                if "Input contains NaN, infinity or a value too large" in str(e):
                    logger.warning("Poor performance detected, skipping current validation fold")
                else:
                    logger.exception(e)

    return table_cv_data
Ejemplo n.º 8
0
    def __init__(self):
        """Set up the logger and the web URL (https://kariera.sk)"""
        super().__init__()

        # The logger carries the name of the concrete class
        class_name = self.__class__.__name__
        self.logger = Logger(class_name)

        # Web URL of the parsed portal
        self.weburl = 'https://kariera.sk'
Ejemplo n.º 9
0
    def __init__(self):
        """Set up the logger and the web URL (https://www.profesia.sk)"""
        super().__init__()

        # The logger carries the name of the concrete class
        class_name = self.__class__.__name__
        self.logger = Logger(class_name)

        # Web URL of the parsed portal
        self.weburl = 'https://www.profesia.sk'
Ejemplo n.º 10
0
def cross_validate_regressor(X,
                             y,
                             model,
                             metrics=("mae", "mse", "rmse", "eer"),
                             num_folds=10,
                             num_repetitions=20,
                             seed=42,
                             logger=None):
    """
    Cross-validate the regression model

    This function cross-validates the input regression model using X, y. The
    cross-validation is set by num_folds and num_repetitions. In each
    repetition, the data are permuted, the out-of-fold predictions of all folds
    are collected, and the metrics are computed once from these predictions.

    Observations of the folds that failed to be evaluated are left out of the
    metrics of that repetition.

    Parameters
    ----------

    X : numpy array
        2D feature matrix (rows=observations, cols=features)

    y : numpy array
        1D labels array

    model : class that implements fit, and predict methods
        Initialized regression model

    metrics : tuple, optional, default ("mae", "mse", "rmse", "eer")
        Tuple with regression metrics to compute

    num_folds : int, optional, default 10
        Number of cross-validation folds

    num_repetitions : int, optional, default 20
        Number of cross-validation runs

    seed : int, optional, default 42
        Random generator seed (used for the permutations and for the folds)

    logger : Logger, optional, default None
        Logger class

    Returns
    -------

    Default dictionary with keys=metric names, vals=lists with one value per
    repetition (non-finite and None values are left out)

    Raises
    ------

    Nothing is raised explicitly. Errors raised while permuting or splitting
    the data and while computing the metrics propagate to the caller. Errors
    raised while fitting or predicting a fold are logged and the fold is
    skipped.
    NOTE(review): the former docstring listed TypeError/ValueError for invalid
    X, y; no such validation is visible here -- confirm where it is done.
    """

    # Prepare the logger
    logger = logger if logger else Logger(
        inspect.currentframe().f_code.co_name)

    # Prepare the results table for the cross-validation results
    table_cv_data = defaultdict(list)

    # Seeded generator for the permutations: the global numpy generator would
    # make the results irreproducible regardless of <seed>.
    if isinstance(seed, np.random.RandomState):
        rng = seed
    else:
        rng = np.random.RandomState(seed)

    # Run the desired number of cross-validation repetitions
    for _ in range(num_repetitions):

        # Permute the data prior to processing (indexing creates new arrays,
        # the caller's X, y are not modified)
        p = rng.permutation(X.shape[0])

        X = X[p]
        y = y[p]

        # Prepare accumulator for predictions, and the mask of observations
        # that actually received a prediction
        y_pred_accumulator = np.zeros(y.shape)
        evaluated = np.zeros(y.shape[0], dtype=bool)

        for train_i, test_i in KFold(n_splits=num_folds,
                                     random_state=seed,
                                     shuffle=True).split(X):

            # Split the data to train, test sets
            X_train, X_test = X[train_i], X[test_i]
            y_train = y[train_i]

            try:

                # fit the regressor
                model.fit(X_train, y_train)

                # Evaluate the regressor
                y_pred_accumulator[test_i] = model.predict(X_test)
                evaluated[test_i] = True

            except Exception as e:
                if "Input contains NaN, infinity or a value too large" in str(
                        e):
                    logger.warning(
                        "Poor performance detected, skipping current validation fold"
                    )
                else:
                    logger.exception(e)

        # Without this, the zeros left in the accumulator by the skipped folds
        # would be scored as if they were predictions
        if not evaluated.any():
            logger.warning(
                "No validation fold was evaluated, skipping current repetition")
            continue

        # Encode the labels
        # NOTE(review): the int16 cast truncates continuous targets and
        # predictions; presumably the targets are integer scores -- confirm.
        y_true = np.array(y[evaluated], dtype=np.int16)
        y_pred = np.array(y_pred_accumulator[evaluated], dtype=np.int16)

        # Compute the regression metrics
        num_features = 1 if X.ndim == 1 else X.shape[1]
        for metric in metrics:
            computed = regression_metric(metric, y_true, y_pred, num_features)

            # Zero is a legitimate metric value (e.g. mae = 0): only missing
            # and non-finite values are left out.
            if computed is not None and np.isfinite(computed):
                table_cv_data[metric].append(computed)

    return table_cv_data
Ejemplo n.º 11
0
def search_over_observations(X,
                             y,
                             model,
                             to_remove=(0, 1, 2),
                             to_score="mcc",
                             threshold=0.5,
                             metrics=("mcc", "acc", "sen", "spe"),
                             num_folds=10,
                             num_repetitions=20,
                             seed=42,
                             logger=None,
                             verbose=False):
    """
    Search for the best sub-set of observations

    This function searches for the sub-sample of observations that yield the best
    classification performance by iteratively sub-sampling combinations from the
    input feature matrix <X> according to <to_remove>. If for instance it is set
    to (0, 1, 2, ...), the function:
     - uses all observations
     - uses all combinations of X.shape[0] - 1 observations
     - uses all combinations of X.shape[0] - 2 observations
     - etc.

    It returns a list of dicts. In each dict, it holds the exact indices used to
    sub-sample the observations, and the performance measures that quantify how
    well the classification model performed on that particular sub-sample.

    Parameters
    ----------

    X : numpy array
        2D feature matrix (rows=observations, cols=features)

    y : numpy array
        1D labels array

    model : class that implements fit, and predict methods
        Initialized binary classification model

    to_remove : list or tuple, optional, default (0, 1, 2)
        Iterable with the number of observations to remove from sub-samples

    to_score : str, optional, default "mcc"
        Scoring function used to score each sub-sample (it is added to
        <metrics> if it is not there already)

    threshold : float, optional, default 0.5
        Threshold for encoding the predicted probability as a class label

    metrics : tuple, optional, default ("mcc", "acc", "sen", "spe")
        Tuple with classification metrics to compute

    num_folds : int, optional, default 10
        Number of cross-validation folds

    num_repetitions : int, optional, default 20
        Number of cross-validation runs

    seed : int, optional, default 42
        Random generator seed

    logger : Logger, optional, default None
        Logger class

    verbose : bool, optional, default False
        Verbosity switch (informing about the performance on each sub-sample)

    Returns
    -------

    List of dicts with the performance on each observation sub-sample. Each
    dict holds:
     - "score": mean of <to_score> rounded to 4 decimals
     - "metrics": list of (metric name, mean, std) tuples
     - "selection": list with the indices of the used observations
     - "removed": number of the removed observations
    The mean and std of a metric with no cross-validation values are NaN.

    Raises
    ------

    Nothing is raised explicitly; errors raised by the indexing of X, y, by
    itertools.combinations or by cross_validate_classifier propagate.
    NOTE(review): the former docstring listed TypeError/ValueError for invalid
    X, y; no such validation is visible here -- confirm where it is done.
    """

    # Prepare the logger
    logger = logger if logger else Logger(inspect.currentframe().f_code.co_name)

    # Make sure the scoring function is in the metrics
    if to_score not in metrics:
        metrics = list(metrics) + [to_score]

    # Prepare the list of results
    results = []

    # Cross-validate the classifier on the sub-samples of observations
    for i in to_remove:
        for selection in combinations(range(X.shape[0]), X.shape[0] - i):

            # Convert to the list (necessary for array slicing)
            selection = list(selection)

            # Select the subset of observations
            X_try = X[selection]
            y_try = y[selection]

            # Cross-validate the classifier
            cv = cross_validate_classifier(X_try,
                                           y_try,
                                           model,
                                           threshold=threshold,
                                           metrics=metrics,
                                           num_folds=num_folds,
                                           num_repetitions=num_repetitions,
                                           seed=seed,
                                           logger=logger)

            # Compute the scores (mean +- std of all cv metrics). A metric
            # may have no values at all (every fold failed or was filtered
            # out); cv.get() then returns None, which np.mean cannot handle.
            stats = {}
            for m in metrics:
                values = cv.get(m)
                if values is not None and len(values) > 0:
                    stats[m] = (np.mean(values), np.std(values))
                else:
                    stats[m] = (np.nan, np.nan)

            scores = [(m, stats[m][0], stats[m][1]) for m in metrics]
            result = round(float(stats[to_score][0]), 4)

            # Append to the results
            results.append({"score": result, "metrics": scores, "selection": selection, "removed": i})

            # Log the performance
            if verbose:
                logger.info("Skipping {} observation(s), {} = {}".format(i, to_score, result))

    return results
Ejemplo n.º 12
0
 def __init__(self):
     """Set up the web content parser and the logger"""
     # Parser of the web content
     self.parser = BeautifulSoupWebContentParser()

     # The logger carries the name of the concrete class
     class_name = self.__class__.__name__
     self.logger = Logger(class_name)