def post(self): """Post urls for extracting content (note: do not save the result)""" result = {'error': False, 'message': ''} urls = request.values.get('urls', '') urls = [u.strip().lower() for u in urls.split(',') if u] if not urls: result['error'] = True result['message'] = 'Urls is empty' return result extractor_name = request.values.get('extractor', list_extractor[0]) s_extractor = get_extractor(extractor_name) if not extractor: result['error'] = True result[ 'message'] = "The extractor name '%s' does not support yet" % extractor_name return result # append urls that missing schema for idx, url in enumerate(urls): if not url.startswith('http'): urls[idx] = 'http://' + url s_crawler = PageCrawler() s_content_getter = ContentGetter(crawler=s_crawler, extractor=s_extractor) result['pages'] = s_content_getter.process(urls) return result
def train_model(urls):
    logger.info('Start train_model...')
    logger.info('Num of train urls: %s' % len(urls))
    result = {}
    # config
    tokenizer = GeneralTokenizer().tokenize
    min_ngram = 1
    max_ngram = 2
    # train
    mg_client = get_mg_client()
    storage = mg_client.web.page
    content_getter_with_storage = ContentGetter(PageCrawlerWithStorage(storage), s_extractor)
    modeler = WebPageTypeModeler(urls, content_getter_with_storage,
                                 path.join(model_loc_dir, model_name),
                                 tokenizer, min_ngram, max_ngram)
    ok, msg = modeler.train()
    mg_client.close()
    if not ok:
        result['error'] = True
        result['message'] = msg
        return result

    result['message'] = "The new model '%s' was trained successfully" % model_name
    result['model_name'] = model_name
    result['data'] = msg
    logger.info('End train_model...')
    return result
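# Usage sketch (hypothetical urls): train_model assumes the surrounding module
# already defines s_extractor, model_loc_dir and model_name, so it is meant to
# be called from that context rather than standalone.
sample_train_urls = [
    'http://example.com/a-news-article',
    'http://example.com/a-product-listing',
]
print train_model(sample_train_urls)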
def post(self): """Post test set urls and model name for evaluation""" result = {'error': False} model_name = request.values.get('model_name', '') urls = request.values.get('urls', '') urls = [u.strip().lower() for u in urls.split(',') if u] if not urls: result['error'] = True result['message'] = 'Urls is empty' return result list_model = get_list_model() if not model_name or model_name not in list_model: result['error'] = True result[ 'message'] = 'Model name is invalid, please select one of below models' result['models'] = list_model return result # append urls that missing schema for idx, url in enumerate(urls): if not url.startswith('http'): urls[idx] = 'http://' + url unlabeled_data = check_unlabeled_data(urls) if unlabeled_data: result['error'] = True result[ 'message'] = 'Please label all urls firstly, unlabeled data: %s' % ', '.join( unlabeled_data) return result extractor_name = request.values.get('extractor', list_extractor[0]) s_extractor = get_extractor(extractor_name) if not extractor: result['error'] = True result[ 'message'] = "The extractor name '%s' does not support yet" % extractor_name return result mg_client = get_mg_client() storage = mg_client.web.page s_crawler = PageCrawlerWithStorage(storage) s_content_getter = ContentGetter(crawler=s_crawler, extractor=s_extractor) s_classifier = PredictWebPageType(model_loc_dir, model_name, s_content_getter, evaluate_mode=True) if classifier.get_current_model() != model_name: s_classifier.web_page_type_classifier = None else: s_classifier.web_page_type_classifier = classifier.web_page_type_classifier s_classifier.labels = classifier.web_page_type_classifier.named_steps[ 'clf'].classes_ evaluation = WebPageTypeModelEvaluation(urls, storage, s_classifier) result.update(evaluation.evaluate()) result['model_name'] = model_name mg_client.close() return result
def crawl_pages(input_file, output_file):
    logger.info('Start processing input %s...' % input_file)
    with open(input_file, 'r') as f:
        # sample 1000 random urls from the input file (one url per line)
        list_url = [re.sub(r'\n', '', u.strip()) for u in random.sample(f.readlines(), 1000)]

    page_crawler = PageCrawler()
    page_extractor = DragnetPageExtractor()
    content_getter = ContentGetter(page_crawler, page_extractor)
    result = content_getter.process(list_url)
    with open(output_file, 'w') as f:
        data = json.dumps(result).encode('utf-8', errors='ignore')
        f.write(data)

    logger.info('End processing input %s...' % input_file)
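# Usage sketch (hypothetical file paths): the input file is expected to hold
# one url per line, and random.sample requires it to contain at least 1000
# lines.
crawl_pages('data/url_list.txt', 'data/crawled_pages.json')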
def evaluate_model(urls):
    logger.info('Start evaluate_model...')
    logger.info('Num of test urls: %s' % len(urls))
    result = {'error': False}
    mg_client = get_mg_client()
    storage = mg_client.web.page
    s_crawler = PageCrawlerWithStorage(storage)
    s_content_getter = ContentGetter(crawler=s_crawler, extractor=s_extractor)
    s_classifier = PredictWebPageType(model_loc_dir, model_name, s_content_getter, evaluate_mode=True)
    evaluation = WebPageTypeModelEvaluation(urls, storage, s_classifier)
    result.update(evaluation.evaluate())
    result['model_name'] = model_name
    mg_client.close()
    logger.info('End evaluate_model...')
    return result
from parser.content_getter import ContentGetter
from parser.crawler import PageCrawler
from parser.extractor import GooseDragnetPageExtractor
from pprint import pprint
import pandas as pd

FIELD_KEYWORD = 'Keyword'
FIELD_URL = 'Landing Page'
FIELD_URL_PAGE_CONTENT = 'Landing Page Content'
FIELD_URL_CRAWL_STATUS = 'Crawl Status'
FIELD_URL_TYPE = 'Url Type'
URL_TYPE_WEB = 'Web'
URL_TYPE_NEWS = 'News'

crawler = PageCrawler()
extractor = GooseDragnetPageExtractor()
content_getter = ContentGetter(crawler=crawler, extractor=extractor)

if __name__ == '__main__':
    url_file = 'data/top_10_ranking_keywords.xlsx'
    df = pd.read_excel(url_file)
    urls = set()
    for idx, row in df.iterrows():
        url = row[FIELD_URL]
        urls.add(url)
        # if idx == 5:
        #     break
    url_page_contents = content_getter.process(urls)
    for idx, row in df.iterrows():
        url = row[FIELD_URL]
        crawled_page = url_page_contents.get(url)
class MyTestCase(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super(MyTestCase, self).__init__(*args, **kwargs)
        self.main_url = "http://flask.pocoo.org/docs/0.10/deploying/wsgi-standalone/"
        self.sub_urls = [
            "http://flask.pocoo.org/docs/0.10/deploying/wsgi-standalone/"
        ]
        self.urls = self.sub_urls + [self.main_url]
        self.crawler = PageCrawler()
        self.extractor = DragnetPageExtractor()
        self.content_getter = ContentGetter(self.crawler, self.extractor)
        self.es_client = Elasticsearch()

    def test_crawler(self):
        result = self.crawler.process(self.urls)
        pprint(result)

    def test_extractor(self):
        pprint(self.extractor.process(self.crawler.process(self.urls)))

    def test_all_text_extractor(self):
        self.extractor = AllTextPageExtractor()
        pprint(self.extractor.process(self.crawler.process([
            'https://www.uncommongoods.com/gifts/personalized/personalized-gifts'
        ])))

    def test_all_text_extractor2(self):
        self.extractor = AllTextPageExtractor()
        pprint(self.extractor.process(self.crawler.process([
            'https://vnexpress.net/tin-tuc/thoi-su/8-nguoi-chet-hon-tram-nghin-ngoi-nha-bi-toc-mai-do-bao-doksuri-3642317.html'
        ])))

    def test_get_text_from_url(self):
        urls = [
            'https://www.uncommongoods.com/gifts/personalized/personalized-gifts',
            'https://stackoverflow.com/questions/1521592/get-root-domain-of-link',
            'https://docs.python.org/2/library/urlparse.html'
        ]
        for url in urls:
            print get_text_from_url(url)

    def test_content_getter(self):
        result = self.content_getter.process(self.urls)
        pprint(result)

    def test_cosine_similarity(self):
        similarity = CosineSimilarity(self.content_getter, self.es_client)
        result = similarity.process(self.main_url, self.sub_urls)
        pprint(result)

    def _call_api(self, i):
        params = {
            'distance_metric': 'cosine',
            'main_url': self.main_url,
            'sub_urls': ', '.join(self.sub_urls)
        }
        response = requests.post('http://107.170.109.238:8888/similarity/check', data=params)
        print i

    def test_api(self):
        params = {
            'distance_metric': 'cosine',
            'main_url': self.main_url,
            'sub_urls': ', '.join(self.sub_urls)
        }
        pool = Pool(4)
        pool.map(self._call_api, range(2000))

    def test_similarity_function(self):
        from similarity_checker import cosine_similarity, jaccard_similarity, fuzzy_similarity, simhash_similarity
        tokens_1 = 'This is a foo ba'.split()
        tokens_2 = 'This sentence is similar to a foo bar sentence'.split()
        pprint('jaccard: %s' % jaccard_similarity(tokens_1, tokens_2))
        pprint('cosine: %s' % cosine_similarity(tokens_1, tokens_2))
        pprint('fuzzy: %s' % fuzzy_similarity(tokens_1, tokens_2))
        pprint('simhash: %s' % simhash_similarity(tokens_1, tokens_2))

    def test_tokenizer(self):
        from similarity_checker import tokenize_and_normalize_content
        url = 'https://www.travelocity.com/Las-Vegas-Hotels-MGM-Grand-Hotel-Casino.h12628.Hotel-Information'
        page = self.content_getter.process([url])
        pprint(tokenize_and_normalize_content(page[url]['content']))

    def test_tokenize_and_normalize(self):
        from similarity_checker import tokenize_and_normalize_content
        text = 'what are you doing'
        pprint(tokenize_and_normalize_content(text, unit='character', min_ngram=1, max_ngram=3))
def post(self): """Post web pages to extract content""" result = {'error': False, 'pages': []} unit = request.values.get('unit', 'word') min_ngram = int(request.values.get('min_ngram', 1)) max_ngram = int(request.values.get('max_ngram', 1)) urls = request.values.get('urls', '') strip_chars = ' "\'' urls = [ u.strip(strip_chars) for u in urls.split(',') if u.strip(strip_chars) ] if not urls: result['error'] = 'urls must not be empty' extractor_name = request.values.get('extractor', list_extractor[0]) s_extractor = get_extractor(extractor_name) if not extractor: result[ 'error'] = "The extractor name '%s' does not support yet" % extractor_name return result if extractor_name == 'selective': s_extractor.selector_type = request.values.get( 'selector_type', list_extractor[0]) selector = request.values.get('selector') if not selector or not selector.strip(): result[ 'error'] = "You must specify the 'selector' element when the 'extractor' is 'selective'" return result s_extractor.selector = selector.strip() user_agent = request.values.get('user_agent', user_agents[0]) page_load_timeout = request.values.get('page_load_timeout', page_load_timeout_default) wait_after_last_request = request.values.get( 'wait_after_last_request', wait_after_last_request_default) s_crawler = PageCrawler( user_agent=user_agent.strip(), page_load_timeout=page_load_timeout, wait_after_last_request=wait_after_last_request) cache = int(request.values.get('cache', 0)) if cache != 0: expire_time = int(request.values.get('expire_time', 604800)) # Seconds = 7 days s_crawler.active_redis_cache(expire_time) s_content_getter = ContentGetter(crawler=s_crawler, extractor=s_extractor) if not result['error']: pages = result['pages'] for url, page in s_content_getter.process(urls).items(): page['tokens'] = tokenize_and_normalize_content( page['content'], unit=unit, min_ngram=min_ngram, max_ngram=max_ngram) pages.append((url, page)) return jsonify(result)
def post(self): """Post web pages to check similarity percentage""" result = {'error': False, 'similarity': []} # get request params unit = request.values.get('unit', 'word') min_ngram = int(request.values.get('min_ngram', 1)) max_ngram = int(request.values.get('max_ngram', 1)) similarity_checker.unit = unit similarity_checker.min_ngram = min_ngram similarity_checker.max_ngram = max_ngram distance_metric = request.values.get('distance_metric', '') if not distance_metric: similarity_checker.similarity = cosine_similarity elif distance_metric not in distance_metrics: result['error'] = 'distance_metric must be in %s' % ', '.join( distance_metrics) return result elif distance_metric == 'jaccard': similarity_checker.similarity = jaccard_similarity elif distance_metric == 'cosine': similarity_checker.similarity = cosine_similarity elif distance_metric == 'fuzzy': similarity_checker.similarity = fuzzy_similarity elif distance_metric == 'simhash': similarity_checker.similarity = simhash_similarity url_1 = request.values.get('url_1', '') url_2 = request.values.get('url_2', '') url_3 = request.values.get('url_3', '') if not url_1: result['error'] = 'url_1 must not blank' return result if not url_2: result['error'] = 'url_2 must not blank' return result if not url_3: result['error'] = 'url_3 must not blank' return result extractor_name = request.values.get('extractor', list_extractor[0]) s_extractor = get_extractor(extractor_name) if not extractor: result[ 'error'] = "The extractor name '%s' does not support yet" % extractor_name return result url_1_selector = None url_2_selector = None url_3_selector = None if extractor_name == 'selective': s_extractor.selector_type = request.values.get( 'selector_type', list_extractor[0]) url_1_selector = request.values.get('url_1_selector') url_2_selector = request.values.get('url_2_selector') url_3_selector = request.values.get('url_3_selector') if not url_1_selector or not url_1_selector.strip(): result['error'] = "You must specify the 'url_1_selector' element when the 'extractor' " \ "is 'selective'" return result if not url_2_selector or not url_2_selector.strip(): result[ 'error'] = "You must specify the 'url_2_selector' element when the 'extractor' is 'selective'" return result if not url_3_selector or not url_3_selector.strip(): result[ 'error'] = "You must specify the 'url_3_selector' element when the 'extractor' is 'selective'" return result user_agent = request.values.get('user_agent', user_agents[0]) page_load_timeout = request.values.get('page_load_timeout', page_load_timeout_default) wait_after_last_request = request.values.get( 'wait_after_last_request', wait_after_last_request_default) s_content_getter = ContentGetter(crawler=PageCrawler( user_agent=user_agent.strip(), page_load_timeout=page_load_timeout, wait_after_last_request=wait_after_last_request), extractor=s_extractor) # check similarity if not result['error']: similarity_checker.content_getter = s_content_getter similarity_checker.url_1_selector = url_1_selector similarity_checker.url_2_selector = url_2_selector similarity_checker.url_3_selector = url_3_selector sims = similarity_checker.cross_process(url_1, url_2, url_3) if sims: result['similarity'] = sims return jsonify(result)
def post(self): """Post web pages to check similarity percentage""" result = {'error': False, 'similarity': []} # get request params unit = request.values.get('unit', 'word') min_ngram = int(request.values.get('min_ngram', 1)) max_ngram = int(request.values.get('max_ngram', 1)) similarity_checker.unit = unit similarity_checker.min_ngram = min_ngram similarity_checker.max_ngram = max_ngram distance_metric = request.values.get('distance_metric', '') if not distance_metric: similarity_checker.similarity = cosine_similarity elif distance_metric not in distance_metrics: result['error'] = 'distance_metric must be in %s' % ', '.join( distance_metrics) return result elif distance_metric == 'jaccard': similarity_checker.similarity = jaccard_similarity elif distance_metric == 'cosine': similarity_checker.similarity = cosine_similarity elif distance_metric == 'fuzzy': similarity_checker.similarity = fuzzy_similarity elif distance_metric == 'simhash': similarity_checker.similarity = simhash_similarity main_url = request.values.get('main_url', '') sub_url_string = request.values.get('sub_urls', '') strip_chars = ' "\'' sub_urls = [ u.strip(strip_chars) for u in sub_url_string.split(',') if u.strip(strip_chars) ] if not main_url: result['error'] = 'main_url must not blank' return result if not sub_urls: result['error'] = 'sub_urls must not blank' return result # validate params type if type(sub_urls) is not list: result['error'] = 'sub_urls must be in array type' return result extractor_name = request.values.get('extractor', list_extractor[0]) s_extractor = get_extractor(extractor_name) if not extractor: result[ 'error'] = "The extractor name '%s' does not support yet" % extractor_name return result main_page_selector = None sub_page_selector = None if extractor_name == 'selective': s_extractor.selector_type = request.values.get( 'selector_type', list_extractor[0]) main_page_selector = request.values.get('main_page_selector') sub_page_selector = request.values.get('sub_page_selector') if not main_page_selector or not main_page_selector.strip(): result['error'] = "You must specify the 'main_page_selector' element when the 'extractor' " \ "is 'selective'" return result if not sub_page_selector or not sub_page_selector.strip(): result[ 'error'] = "You must specify the 'sub_page_selector' element when the 'extractor' is 'selective'" return result user_agent = request.values.get('user_agent', user_agents[0]) page_load_timeout = request.values.get('page_load_timeout', page_load_timeout_default) wait_after_last_request = request.values.get( 'wait_after_last_request', wait_after_last_request_default) s_content_getter = ContentGetter(crawler=PageCrawler( user_agent=user_agent.strip(), page_load_timeout=page_load_timeout, wait_after_last_request=wait_after_last_request), extractor=s_extractor) # check similarity if not result['error']: similarity_checker.content_getter = s_content_getter if main_page_selector: similarity_checker.main_page_selector = main_page_selector.strip( ) similarity_checker.sub_page_selector = sub_page_selector.strip( ) sims = similarity_checker.process(main_url=main_url, sub_urls=sub_urls) if sims: result['similarity'] = sims else: result['error'] = 'Main page is empty' return jsonify(result)
from flask import request, jsonify
from flask_restplus import Api, Resource, fields
from app import app
from parser.content_getter import ContentGetter
from parser.crawler_cluster import PageCrawlerCluster as PageCrawler
from parser.extractor import DragnetPageExtractor, ReadabilityPageExtractor, GoosePageExtractor, \
    GooseDragnetPageExtractor, SelectivePageExtractor, AllTextPageExtractor
from similarity_checker import SimilarityChecker, jaccard_similarity, cosine_similarity, \
    fuzzy_similarity, simhash_similarity, tokenize_and_normalize_content

api = Api(app, doc='/doc/', version='1.0', title='Web pages similarity')

crawler = PageCrawler()
extractor = DragnetPageExtractor()
content_getter = ContentGetter(crawler=crawler, extractor=extractor)
similarity_checker = SimilarityChecker(content_getter=content_getter, similarity=cosine_similarity)
list_extractor = ['dragnet', 'goose', 'goose_dragnet', 'readability', 'selective', 'all_text']


def get_extractor(name):
    if name == 'dragnet':
        return DragnetPageExtractor()
    elif name == 'readability':
        return ReadabilityPageExtractor()
    elif name == 'goose':
        return GoosePageExtractor()
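    # Sketch for the remaining names in list_extractor, assuming no-arg
    # constructors like the extractors above; the endpoint handlers assign
    # selector_type/selector on the returned object afterwards.
    elif name == 'goose_dragnet':
        return GooseDragnetPageExtractor()
    elif name == 'selective':
        return SelectivePageExtractor()
    elif name == 'all_text':
        return AllTextPageExtractor()
    # unknown names fall through to None so callers can report the error
    return None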
def post(self): """Post web page urls to train new model""" result = {'error': False, 'message': ''} urls = request.values.get('urls', '') urls = [u.strip().lower() for u in urls.split(',') if u] if not urls: result['error'] = True result['message'] = 'Urls is empty' return result extractor_name = request.values.get('extractor', list_extractor[0]) s_extractor = get_extractor(extractor_name) if not extractor: result['error'] = True result[ 'message'] = "The extractor name '%s' does not support yet" % extractor_name return result model_name = request.values.get( 'model_name', time.strftime(self.date_time_format) + '_page_type_classifier.model') if model_name in get_list_model(): result['error'] = True result[ 'message'] = "The model name '%s' is duplicated, please select another model name." % model_name return result tokenizer_name = request.values.get('tokenizer', list_tokenizer[0]) if not tokenizer_name: result['error'] = True result['message'] = 'Tokenizer is empty' return result tokenizer = get_tokenizer(tokenizer_name) if not tokenizer: result['error'] = True result['message'] = "Tokenizer name '%s' is not supported, please choose one of these tokenizer name: %s" \ % (tokenizer_name, ', '.join(list_tokenizer)) return result min_ngram = request.values.get('min_ngram', '1') max_ngram = request.values.get('max_ngram', '2') try: min_ngram = int(min_ngram) max_ngram = int(max_ngram) except ValueError: result['error'] = True result['message'] = 'Max ngram and min ngram must be integer' return result # append urls that missing schema for idx, url in enumerate(urls): if not url.startswith('http'): urls[idx] = 'http://' + url unlabeled_data = check_unlabeled_data(urls) if unlabeled_data: result['error'] = True result[ 'message'] = 'Please label all urls firstly, unlabeled data: %s' % ', '.join( unlabeled_data) return result mg_client = get_mg_client() storage = mg_client.web.page content_getter_with_storage = ContentGetter( PageCrawlerWithStorage(storage), s_extractor) modeler = WebPageTypeModeler(urls, content_getter_with_storage, path.join(model_loc_dir, model_name), tokenizer, min_ngram, max_ngram) ok, msg = modeler.train() mg_client.close() if not ok: result['error'] = True result['message'] = msg return result result[ 'message'] = 'The new model name %s was trained successfully' % model_name result['model_name'] = model_name result['data'] = msg return result