def train_model(urls):
    logger.info('Start train_model...')
    logger.info('Num of train urls: %s' % len(urls))
    result = {}
    # config
    tokenizer = GeneralTokenizer().tokenize
    min_ngram = 1
    max_ngram = 2

    # train
    mg_client = get_mg_client()
    storage = mg_client.web.page
    content_getter_with_storage = ContentGetter(
        PageCrawlerWithStorage(storage), s_extractor)
    modeler = WebPageTypeModeler(urls, content_getter_with_storage,
                                 path.join(model_loc_dir, model_name),
                                 tokenizer, min_ngram, max_ngram)
    ok, msg = modeler.train()
    mg_client.close()

    if not ok:
        result['error'] = True
        result['message'] = msg
        return result

    result['message'] = ('The new model %s was trained successfully'
                         % model_name)
    result['model_name'] = model_name
    result['data'] = msg
    logger.info('End train_model...')
    return result
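
A side note on the min_ngram/max_ngram settings above: they describe the word n-gram range used as features. The vectorizer inside WebPageTypeModeler is not shown in these examples, so the sketch below uses scikit-learn's CountVectorizer purely as a stand-in to illustrate what a (1, 2) range produces.

# Illustrative sketch only, not part of the original module.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 2))  # unigrams and bigrams
vectorizer.fit(['add to cart checkout now'])
print(sorted(vectorizer.vocabulary_))
# ['add', 'add to', 'cart', 'cart checkout', 'checkout', 'checkout now',
#  'now', 'to', 'to cart']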
Example #2
    def delete(self):
        """Get list labeled data"""
        result = {
            'error': False,
            'message': '',
        }
        urls = request.values.get('urls', '')
        page_types = request.values.get('type', '')

        page_types = [t.strip().lower()
                      for t in page_types.split(',')] if page_types else []
        urls = [u.strip().lower() for u in urls.split(',')
                if u] if urls else []
        if not urls:
            result['error'] = True
            result['message'] = 'Urls are empty'
            return result

        # prepend the http scheme to urls that are missing it
        for idx, url in enumerate(urls):
            if not url.startswith('http'):
                urls[idx] = 'http://' + url

        mg_client = get_mg_client()
        storage = mg_client.web.page
        web_page_type = WebPageType(storage)
        deleted_count = web_page_type.delete(page_types, urls)
        result['message'] = '%s urls were deleted' % deleted_count
        mg_client.close()
        return result
Example #3
    def post(self):
        """Post test set urls and model name for evaluation"""
        result = {'error': False}
        model_name = request.values.get('model_name', '')
        urls = request.values.get('urls', '')
        urls = [u.strip().lower() for u in urls.split(',') if u]
        if not urls:
            result['error'] = True
            result['message'] = 'Urls are empty'
            return result
        list_model = get_list_model()
        if not model_name or model_name not in list_model:
            result['error'] = True
            result['message'] = ('Model name is invalid, please select one of '
                                 'the models below')
            result['models'] = list_model
            return result

        # append urls that missing schema
        for idx, url in enumerate(urls):
            if not url.startswith('http'):
                urls[idx] = 'http://' + url

        unlabeled_data = check_unlabeled_data(urls)
        if unlabeled_data:
            result['error'] = True
            result['message'] = ('Please label all urls first, unlabeled data: '
                                 '%s' % ', '.join(unlabeled_data))
            return result

        extractor_name = request.values.get('extractor', list_extractor[0])
        s_extractor = get_extractor(extractor_name)
        if not s_extractor:
            result['error'] = True
            result['message'] = ("The extractor name '%s' is not supported yet"
                                 % extractor_name)
            return result

        mg_client = get_mg_client()
        storage = mg_client.web.page
        s_crawler = PageCrawlerWithStorage(storage)
        s_content_getter = ContentGetter(crawler=s_crawler,
                                         extractor=s_extractor)
        s_classifier = PredictWebPageType(model_loc_dir,
                                          model_name,
                                          s_content_getter,
                                          evaluate_mode=True)
        if classifier.get_current_model() != model_name:
            s_classifier.web_page_type_classifier = None
        else:
            s_classifier.web_page_type_classifier = classifier.web_page_type_classifier
            s_classifier.labels = classifier.web_page_type_classifier.named_steps[
                'clf'].classes_

        evaluation = WebPageTypeModelEvaluation(urls, storage, s_classifier)
        result.update(evaluation.evaluate())
        result['model_name'] = model_name
        mg_client.close()
        return result
Example #4
    def post(self):
        """Post labeled data to update"""
        result = {'error': False, 'message': ''}
        urls = request.values.get('urls', '')
        page_type = request.values.get('type', '')

        urls = [u.strip().lower() for u in urls.split(',') if u]
        if not urls:
            result['error'] = True
            result['message'] = 'Urls are empty'
            return result

        if not page_type:
            result['error'] = True
            result['message'] = 'The web page type is empty'
            return result

        # prepend the http scheme to urls that are missing it
        for idx, url in enumerate(urls):
            if not url.startswith('http'):
                urls[idx] = 'http://' + url

        mg_client = get_mg_client()
        storage = mg_client.web.page
        web_page_type = WebPageType(storage)
        updated_count = web_page_type.update(urls, page_type)
        mg_client.close()
        result['message'] = '%s urls were updated successfully' % updated_count
        return result
def evaluate_model(urls):
    logger.info('Start evaluate_model...')
    logger.info('Num of test urls: %s' % len(urls))
    result = {'error': False}
    mg_client = get_mg_client()
    storage = mg_client.web.page
    s_crawler = PageCrawlerWithStorage(storage)
    s_content_getter = ContentGetter(crawler=s_crawler, extractor=s_extractor)
    s_classifier = PredictWebPageType(model_loc_dir,
                                      model_name,
                                      s_content_getter,
                                      evaluate_mode=True)

    evaluation = WebPageTypeModelEvaluation(urls, storage, s_classifier)
    result.update(evaluation.evaluate())
    result['model_name'] = model_name
    mg_client.close()
    logger.info('End evaluate_model...')
    return result
Example #6
    def get(self):
        """Get list labeled data"""
        result = {'error': False, 'message': '', 'pages': []}
        urls = request.values.get('urls', '')
        page_types = request.values.get('type', '')
        limit = request.values.get('limit', '50')
        offset = request.values.get('offset', '0')

        try:
            limit = int(limit)
        except ValueError:
            result['error'] = True
            result['message'] = 'limit must be an integer'
            return result

        try:
            offset = int(offset)
        except ValueError:
            result['error'] = True
            result['message'] = 'offset must be an integer'
            return result

        page_types = [t.strip().lower()
                      for t in page_types.split(',')] if page_types else []
        urls = [u.strip().lower() for u in urls.split(',')
                if u] if urls else []
        # prepend the http scheme to urls that are missing it
        for idx, url in enumerate(urls):
            if not url.startswith('http'):
                urls[idx] = 'http://' + url

        mg_client = get_mg_client()
        storage = mg_client.web.page
        web_page_type = WebPageType(storage)
        pages, type_count, total = web_page_type.search(
            page_types, urls, limit, offset)
        mg_client.close()
        result['pages'] = pages
        result['type_count'] = type_count
        result['total'] = total
        return result
Example #7
    def post(self):
        """Post urls for crawling and save to database"""
        result = {'error': False, 'message': ''}
        urls = request.values.get('urls', '')

        urls = [u.strip().lower() for u in urls.split(',') if u]
        if not urls:
            result['error'] = True
            result['message'] = 'Urls are empty'
            return result

        # prepend the http scheme to urls that are missing it
        for idx, url in enumerate(urls):
            if not url.startswith('http'):
                urls[idx] = 'http://' + url

        mg_client = get_mg_client()
        storage = mg_client.web.page
        s_crawler = PageCrawlerWithStorage(storage)
        pages = s_crawler.process(urls)
        mg_client.close()
        result['message'] = '%s urls were crawled successfully' % len(pages)
        return result
def get_training_urls(item_num_each_label):
    logger.info('Start get_training_urls...')
    result = []
    mg_client = get_mg_client()
    db = mg_client.web.page
    agg_pipeline = [{
        '$match': {
            'type': {
                '$in': ['ecommerce', 'news/blog']
            }
        }
    }, {
        '$group': {
            '_id': '$type',
            'urls': {
                '$push': '$_id'
            }
        }
    }]
    agg_type_urls = {}
    for a in db.aggregate(agg_pipeline):
        agg_type_urls[a['_id']] = a['urls']

    for page_type, urls in agg_type_urls.items():
        logger.info('Urls for type %s: %s in total' %
                    (page_type, len(urls)))
        s_urls = random.sample(
            urls, item_num_each_label
            if len(urls) > item_num_each_label else len(urls))
        logger.info('Training data for type %s: %s urls' %
                    (page_type, len(s_urls)))
        result += s_urls

    mg_client.close()
    logger.info('Total training urls: %s' % len(result))
    logger.info('End get_training_urls...')
    return result
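
Taken together, get_training_urls, train_model and evaluate_model suggest a simple offline workflow. The driver below is only a sketch: the per-label sample size and the train/test split are illustrative, and it assumes the same module-level configuration (get_mg_client, s_extractor, model_loc_dir, model_name, logger) used throughout these examples.

# Hypothetical driver, not part of the original module.
def run_offline_training(item_num_each_label=500, test_ratio=0.2):
    urls = get_training_urls(item_num_each_label)
    random.shuffle(urls)

    # hold out a slice of the labeled urls for evaluation (illustrative split)
    split = int(len(urls) * (1 - test_ratio))
    train_urls, test_urls = urls[:split], urls[split:]

    train_result = train_model(train_urls)
    if train_result.get('error'):
        logger.error('Training failed: %s' % train_result['message'])
        return train_result

    return evaluate_model(test_urls)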
Example #9
    def test_crawler(self):
        mg_client = get_mg_client()
        storage = mg_client.web.page
        crawler = PageCrawlerWithStorage(storage)
        res = crawler.process(self.urls)
        pprint(res)
Example #10
def check_unlabeled_data(urls):
    mg_client = get_mg_client()
    storage = mg_client.web.page
    web_page_type = WebPageType(storage)
    # query before closing the client so the storage handle is still usable
    unlabeled = web_page_type.check_unlabeled_data(urls)
    mg_client.close()
    return unlabeled
Example #11
    def post(self):
        """Post web page urls to train new model"""
        result = {'error': False, 'message': ''}
        urls = request.values.get('urls', '')
        urls = [u.strip().lower() for u in urls.split(',') if u]
        if not urls:
            result['error'] = True
            result['message'] = 'Urls are empty'
            return result

        extractor_name = request.values.get('extractor', list_extractor[0])
        s_extractor = get_extractor(extractor_name)
        if not s_extractor:
            result['error'] = True
            result['message'] = ("The extractor name '%s' is not supported yet"
                                 % extractor_name)
            return result

        model_name = request.values.get(
            'model_name',
            time.strftime(self.date_time_format) +
            '_page_type_classifier.model')
        if model_name in get_list_model():
            result['error'] = True
            result['message'] = ("The model name '%s' already exists, please "
                                 "select another model name." % model_name)
            return result

        tokenizer_name = request.values.get('tokenizer', list_tokenizer[0])
        if not tokenizer_name:
            result['error'] = True
            result['message'] = 'Tokenizer is empty'
            return result

        tokenizer = get_tokenizer(tokenizer_name)
        if not tokenizer:
            result['error'] = True
            result['message'] = "Tokenizer name '%s' is not supported, please choose one of these tokenizer name: %s" \
                                % (tokenizer_name, ', '.join(list_tokenizer))
            return result

        min_ngram = request.values.get('min_ngram', '1')
        max_ngram = request.values.get('max_ngram', '2')

        try:
            min_ngram = int(min_ngram)
            max_ngram = int(max_ngram)
        except ValueError:
            result['error'] = True
            result['message'] = 'Max ngram and min ngram must be integers'
            return result

        # prepend the http scheme to urls that are missing it
        for idx, url in enumerate(urls):
            if not url.startswith('http'):
                urls[idx] = 'http://' + url

        unlabeled_data = check_unlabeled_data(urls)
        if unlabeled_data:
            result['error'] = True
            result['message'] = ('Please label all urls first, unlabeled data: '
                                 '%s' % ', '.join(unlabeled_data))
            return result

        mg_client = get_mg_client()
        storage = mg_client.web.page
        content_getter_with_storage = ContentGetter(
            PageCrawlerWithStorage(storage), s_extractor)
        modeler = WebPageTypeModeler(urls, content_getter_with_storage,
                                     path.join(model_loc_dir, model_name),
                                     tokenizer, min_ngram, max_ngram)
        ok, msg = modeler.train()
        mg_client.close()
        if not ok:
            result['error'] = True
            result['message'] = msg
            return result

        result['message'] = ('The new model %s was trained successfully'
                             % model_name)
        result['model_name'] = model_name
        result['data'] = msg
        return result
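
Because these handlers read their parameters from request.values, a client can send them either as form data or as a query string. The snippet below is a hypothetical client-side call using the requests library: the endpoint path and the extractor/tokenizer names are assumptions (route registration and the supported names are not shown in these examples); the parameter names mirror the request.values.get calls above.

# Hypothetical client call; the route '/page-type/train' and the extractor and
# tokenizer names are placeholders, not confirmed by these examples.
import requests

payload = {
    'urls': 'http://example.com/product/1,example.org/blog/post',
    'extractor': 'dragnet',
    'model_name': 'demo_page_type_classifier.model',
    'tokenizer': 'general',
    'min_ngram': '1',
    'max_ngram': '2',
}
resp = requests.post('http://localhost:5000/page-type/train', data=payload)
print(resp.json())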