def train_model(urls):
    logger.info('Start train_model...')
    logger.info('Num of train urls: %s' % len(urls))
    result = {'error': False}
    # config
    tokenizer = GeneralTokenizer().tokenize
    min_ngram = 1
    max_ngram = 2
    # train
    mg_client = get_mg_client()
    storage = mg_client.web.page
    content_getter_with_storage = ContentGetter(
        PageCrawlerWithStorage(storage), s_extractor)
    modeler = WebPageTypeModeler(urls, content_getter_with_storage,
                                 path.join(model_loc_dir, model_name),
                                 tokenizer, min_ngram, max_ngram)
    ok, msg = modeler.train()
    mg_client.close()
    if not ok:
        result['error'] = True
        result['message'] = msg
        return result

    result['message'] = "The new model '%s' was trained successfully" % model_name
    result['model_name'] = model_name
    result['data'] = msg
    logger.info('End train_model...')
    return result
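These functions rely on a module-level `get_mg_client()` factory (and globals such as `logger`, `s_extractor`, `model_name`, and `model_loc_dir`) that are defined elsewhere in the module. A minimal sketch of what that factory might look like, assuming MongoDB via pymongo with a `web` database and a `page` collection; the connection URI is a placeholder, not the project's actual configuration:

```python
# Hypothetical sketch of the MongoDB client factory used by the handlers.
# Assumes pymongo and a local MongoDB instance; the URI is illustrative only.
from pymongo import MongoClient


def get_mg_client():
    # mg_client.web.page in the handlers resolves to the 'page'
    # collection of the 'web' database on this client.
    return MongoClient('mongodb://localhost:27017')
```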
def delete(self):
    """Delete labeled data"""
    result = {
        'error': False,
        'message': '',
    }
    urls = request.values.get('urls', '')
    page_types = request.values.get('type', '')
    page_types = [t.strip().lower() for t in page_types.split(',')] if page_types else []
    urls = [u.strip().lower() for u in urls.split(',') if u] if urls else []
    if not urls:
        result['error'] = True
        result['message'] = 'Urls is empty'
        return result

    # prepend the scheme for urls that are missing it
    for idx, url in enumerate(urls):
        if not url.startswith('http'):
            urls[idx] = 'http://' + url

    mg_client = get_mg_client()
    storage = mg_client.web.page
    web_page_type = WebPageType(storage)
    deleted_count = web_page_type.delete(page_types, urls)
    result['message'] = '%s urls were deleted' % deleted_count
    mg_client.close()
    return result
def post(self):
    """Post test set urls and model name for evaluation"""
    result = {'error': False}
    model_name = request.values.get('model_name', '')
    urls = request.values.get('urls', '')
    urls = [u.strip().lower() for u in urls.split(',') if u]
    if not urls:
        result['error'] = True
        result['message'] = 'Urls is empty'
        return result

    list_model = get_list_model()
    if not model_name or model_name not in list_model:
        result['error'] = True
        result['message'] = 'Model name is invalid, please select one of the models below'
        result['models'] = list_model
        return result

    # prepend the scheme for urls that are missing it
    for idx, url in enumerate(urls):
        if not url.startswith('http'):
            urls[idx] = 'http://' + url

    unlabeled_data = check_unlabeled_data(urls)
    if unlabeled_data:
        result['error'] = True
        result['message'] = 'Please label all urls first, unlabeled data: %s' % ', '.join(unlabeled_data)
        return result

    extractor_name = request.values.get('extractor', list_extractor[0])
    s_extractor = get_extractor(extractor_name)
    if not s_extractor:
        result['error'] = True
        result['message'] = "The extractor name '%s' is not supported yet" % extractor_name
        return result

    mg_client = get_mg_client()
    storage = mg_client.web.page
    s_crawler = PageCrawlerWithStorage(storage)
    s_content_getter = ContentGetter(crawler=s_crawler, extractor=s_extractor)
    s_classifier = PredictWebPageType(model_loc_dir, model_name, s_content_getter, evaluate_mode=True)
    # reuse the already loaded classifier when it matches the requested model
    if classifier.get_current_model() != model_name:
        s_classifier.web_page_type_classifier = None
    else:
        s_classifier.web_page_type_classifier = classifier.web_page_type_classifier
        s_classifier.labels = classifier.web_page_type_classifier.named_steps['clf'].classes_

    evaluation = WebPageTypeModelEvaluation(urls, storage, s_classifier)
    result.update(evaluation.evaluate())
    result['model_name'] = model_name
    mg_client.close()
    return result
def post(self):
    """Post labeled data to update"""
    result = {'error': False, 'message': ''}
    urls = request.values.get('urls', '')
    page_type = request.values.get('type', '')
    urls = [u.strip().lower() for u in urls.split(',') if u]
    if not urls:
        result['error'] = True
        result['message'] = 'Urls is empty'
        return result

    if not page_type:
        result['error'] = True
        result['message'] = 'The web page type is empty'
        return result

    # prepend the scheme for urls that are missing it
    for idx, url in enumerate(urls):
        if not url.startswith('http'):
            urls[idx] = 'http://' + url

    mg_client = get_mg_client()
    storage = mg_client.web.page
    web_page_type = WebPageType(storage)
    updated_count = web_page_type.update(urls, page_type)
    mg_client.close()
    result['message'] = 'The label of %s urls was updated successfully' % updated_count
    return result
def evaluate_model(urls):
    logger.info('Start evaluate_model...')
    logger.info('Num of test urls: %s' % len(urls))
    result = {'error': False}
    mg_client = get_mg_client()
    storage = mg_client.web.page
    s_crawler = PageCrawlerWithStorage(storage)
    s_content_getter = ContentGetter(crawler=s_crawler, extractor=s_extractor)
    s_classifier = PredictWebPageType(model_loc_dir, model_name, s_content_getter, evaluate_mode=True)
    evaluation = WebPageTypeModelEvaluation(urls, storage, s_classifier)
    result.update(evaluation.evaluate())
    result['model_name'] = model_name
    mg_client.close()
    logger.info('End evaluate_model...')
    return result
def get(self):
    """Get list labeled data"""
    result = {'error': False, 'message': '', 'pages': []}
    urls = request.values.get('urls', '')
    page_types = request.values.get('type', '')
    limit = request.values.get('limit', '50')
    offset = request.values.get('offset', '0')
    try:
        limit = int(limit)
    except ValueError:
        result['error'] = True
        result['message'] = 'limit must be an integer'
        return result

    try:
        offset = int(offset)
    except ValueError:
        result['error'] = True
        result['message'] = 'offset must be an integer'
        return result

    page_types = [t.strip().lower() for t in page_types.split(',')] if page_types else []
    urls = [u.strip().lower() for u in urls.split(',') if u] if urls else []
    # prepend the scheme for urls that are missing it
    for idx, url in enumerate(urls):
        if not url.startswith('http'):
            urls[idx] = 'http://' + url

    mg_client = get_mg_client()
    storage = mg_client.web.page
    web_page_type = WebPageType(storage)
    pages, type_count, total = web_page_type.search(page_types, urls, limit, offset)
    mg_client.close()
    result['pages'] = pages
    result['type_count'] = type_count
    result['total'] = total
    return result
def post(self):
    """Post urls for crawling and save to database"""
    result = {'error': False, 'message': ''}
    urls = request.values.get('urls', '')
    urls = [u.strip().lower() for u in urls.split(',') if u]
    if not urls:
        result['error'] = True
        result['message'] = 'Urls is empty'
        return result

    # prepend the scheme for urls that are missing it
    for idx, url in enumerate(urls):
        if not url.startswith('http'):
            urls[idx] = 'http://' + url

    mg_client = get_mg_client()
    storage = mg_client.web.page
    s_crawler = PageCrawlerWithStorage(storage)
    pages = s_crawler.process(urls)
    mg_client.close()
    result['message'] = '%s urls were crawled successfully' % len(pages)
    return result
def get_training_urls(item_num_each_label):
    logger.info('Start get_training_urls...')
    result = []
    mg_client = get_mg_client()
    db = mg_client.web.page
    # group labeled pages by type and collect their urls
    agg_pipeline = [
        {'$match': {'type': {'$in': ['ecommerce', 'news/blog']}}},
        {'$group': {'_id': '$type', 'urls': {'$push': '$_id'}}}
    ]
    agg_type_urls = {}
    for a in db.aggregate(agg_pipeline):
        agg_type_urls[a['_id']] = a['urls']

    # sample at most item_num_each_label urls per page type
    for page_type, urls in agg_type_urls.items():
        logger.info('Urls for type %s, total: %s' % (page_type, len(urls)))
        s_urls = random.sample(urls, min(item_num_each_label, len(urls)))
        logger.info('Training data for type %s: %s urls' % (page_type, len(s_urls)))
        result += s_urls

    mg_client.close()
    logger.info('Total training urls: %s' % len(result))
    logger.info('End get_training_urls...')
    return result
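For reference, the two offline helpers `get_training_urls` and `train_model` could be wired together in a small driver. A minimal sketch, assuming both functions live in the same module; the 500 samples per label is an arbitrary illustrative value, not a project default:

```python
# Illustrative driver only; 500 samples per label is an assumed value.
if __name__ == '__main__':
    training_urls = get_training_urls(item_num_each_label=500)
    report = train_model(training_urls)
    if report.get('error'):
        logger.error('Training failed: %s' % report['message'])
    else:
        logger.info('Trained model: %s' % report['model_name'])
```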
def test_crawler(self):
    mg_client = get_mg_client()
    storage = mg_client.web.page
    crawler = PageCrawlerWithStorage(storage)
    res = crawler.process(self.urls)
    pprint(res)
def check_unlabeled_data(urls):
    mg_client = get_mg_client()
    storage = mg_client.web.page
    web_page_type = WebPageType(storage)
    # query before closing the client, otherwise the storage handle is no longer usable
    unlabeled = web_page_type.check_unlabeled_data(urls)
    mg_client.close()
    return unlabeled
def post(self):
    """Post web page urls to train new model"""
    result = {'error': False, 'message': ''}
    urls = request.values.get('urls', '')
    urls = [u.strip().lower() for u in urls.split(',') if u]
    if not urls:
        result['error'] = True
        result['message'] = 'Urls is empty'
        return result

    extractor_name = request.values.get('extractor', list_extractor[0])
    s_extractor = get_extractor(extractor_name)
    if not s_extractor:
        result['error'] = True
        result['message'] = "The extractor name '%s' is not supported yet" % extractor_name
        return result

    model_name = request.values.get(
        'model_name',
        time.strftime(self.date_time_format) + '_page_type_classifier.model')
    if model_name in get_list_model():
        result['error'] = True
        result['message'] = "The model name '%s' is duplicated, please select another model name." % model_name
        return result

    tokenizer_name = request.values.get('tokenizer', list_tokenizer[0])
    if not tokenizer_name:
        result['error'] = True
        result['message'] = 'Tokenizer is empty'
        return result

    tokenizer = get_tokenizer(tokenizer_name)
    if not tokenizer:
        result['error'] = True
        result['message'] = "Tokenizer name '%s' is not supported, please choose one of these tokenizer names: %s" \
                            % (tokenizer_name, ', '.join(list_tokenizer))
        return result

    min_ngram = request.values.get('min_ngram', '1')
    max_ngram = request.values.get('max_ngram', '2')
    try:
        min_ngram = int(min_ngram)
        max_ngram = int(max_ngram)
    except ValueError:
        result['error'] = True
        result['message'] = 'Min ngram and max ngram must be integers'
        return result

    # prepend the scheme for urls that are missing it
    for idx, url in enumerate(urls):
        if not url.startswith('http'):
            urls[idx] = 'http://' + url

    unlabeled_data = check_unlabeled_data(urls)
    if unlabeled_data:
        result['error'] = True
        result['message'] = 'Please label all urls first, unlabeled data: %s' % ', '.join(unlabeled_data)
        return result

    mg_client = get_mg_client()
    storage = mg_client.web.page
    content_getter_with_storage = ContentGetter(
        PageCrawlerWithStorage(storage), s_extractor)
    modeler = WebPageTypeModeler(urls, content_getter_with_storage,
                                 path.join(model_loc_dir, model_name),
                                 tokenizer, min_ngram, max_ngram)
    ok, msg = modeler.train()
    mg_client.close()
    if not ok:
        result['error'] = True
        result['message'] = msg
        return result

    result['message'] = "The new model '%s' was trained successfully" % model_name
    result['model_name'] = model_name
    result['data'] = msg
    return result
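A client could exercise this training handler with a form-encoded POST. A minimal sketch using `requests`, where the mount path, host, and all parameter values are assumptions for illustration; only the parameter names (`urls`, `extractor`, `tokenizer`, `model_name`, `min_ngram`, `max_ngram`) come from the handler above:

```python
import requests

# Hypothetical endpoint URL and values; parameter names are taken from the handler.
resp = requests.post('http://localhost:5000/train', data={
    'urls': 'example.com,shop.example.org',
    'extractor': 'dragnet',    # assumed extractor name
    'tokenizer': 'general',    # assumed tokenizer name
    'model_name': 'demo_page_type_classifier.model',
    'min_ngram': '1',
    'max_ngram': '2',
})
print(resp.json())
```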