def test_crawl(self):
    """
    Tests crawl method

    The get_html method of the html_requester class is mocked to return the
    contents of html_test_data.html. This mocking allows for inputting test
    html data without having to host it online.
    """
    file_util = FileUtil()
    expected_result = file_util.get_file_contents("crawl_test_data.txt")
    web_crawler = WebCrawler()
    web_crawler.html_requester.get_html = lambda url: self.mock_get_html(url)
    actual_result = web_crawler.crawl("http://www.domain.com")
    self.assertEqual(expected_result, actual_result)
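# A minimal sketch of the mock_get_html helper referenced above (hypothetical,
# not shown in the original test): it lives on the same test class and simply
# returns the canned fixture named in the docstring, so the crawler never
# touches the network. The FileUtil API is assumed to match the usage above.
def mock_get_html(self, url):
    return FileUtil().get_file_contents("html_test_data.html")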
def main():
    try:
        global wc_dev, wc_itpro, dev_index, itpro_index, ranks
        wc_dev = WebCrawler()
        wc_itpro = WebCrawler()
        dev_index = wc_dev.crawl_web("dev")
        itpro_index = wc_itpro.crawl_web("itpro")
        ranks = wc_dev.compute_ranks()
        server = HTTPServer(('192.168.1.225', 8080), OffDocsHandler)
        print 'started http server...'
        server.serve_forever()
    except KeyboardInterrupt:
        print 'shutting down server'
        server.socket.close()
class WebCrawlerTests(unittest.TestCase):
    def setUp(self):
        self.crawler = WebCrawler("hackbulgaria.com")

    def test_prepare_link(self):
        url = "http://hackbulgaria.com/"
        href = "/courses/"
        self.assertEqual(self.crawler.prepare_link(url, href),
                         "http://hackbulgaria.com/courses/")

    def test_is_outgoing_false(self):
        self.assertTrue(self.crawler.is_outgoing("http://facebook.com"))

    def test_is_outgoing_true(self):
        url = "https://hackbulgaria.com/media/content_media/"
        url += "JavaScript-Frontend-conspect.pdf"
        self.assertFalse(self.crawler.is_outgoing(url))
class SuspiciousURLCrawler:
    def __init__(self):
        self.web_crawler = WebCrawler()

    # Crawls domains that are typos of popular domains to scrape suspicious URLs
    def get_suspicious_urls(self, verbose: bool, num_urls=None) -> [str]:
        typos = reduce(set.union, map(generate_typos, _get_popular_domains()), set())
        typos = set(map(lambda typo: PROTOCOL + typo, typos))
        return self.web_crawler.scrape_links(typos, verbose, num_urls,
                                             _get_exclusion_sites())
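# A hypothetical sketch of the generate_typos helper used above (the real
# implementation is not shown here): it produces simple "fat-finger" variants
# of a domain, namely dropped and doubled characters in the name part.
def generate_typos(domain: str) -> set:
    name, dot, tld = domain.rpartition(".")
    typos = set()
    for i in range(len(name)):
        typos.add(name[:i] + name[i + 1:] + dot + tld)                # dropped character
        typos.add(name[:i] + name[i] * 2 + name[i + 1:] + dot + tld)  # doubled character
    return typos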
def main(args):
    logging.basicConfig(filename='web_crawler_' + str(datetime.now()) + '.log',
                        filemode='w',
                        format='%(name)s - %(levelname)s - %(message)s',
                        level=logging.DEBUG)
    try:
        web_crawler_instance = WebCrawler(args[1])
        web_crawler_instance.crawl()
    except IndexError as ex:
        logging.error(
            f"An error has occurred whilst running the crawler, no URL was provided: {str(ex)}"
        )
    except DBConnectionError as ex:
        logging.error(f"An error has occurred whilst connecting to DB: {str(ex)}")
    except Exception as ex:
        logging.error(f"An error has occurred whilst running the crawler: {str(ex)}")
    logging.info("Program finished running.")
def test_crawler(self):
    web_crawler = WebCrawler(url="http://localhost:5000",
                             max_threads=32,
                             max_pages=float("inf"))
    self.assertEqual(
        sorted(list(web_crawler.crawl())),
        sorted([
            "http://localhost:5000/com",
            "http://localhost:5000/test",
            "http://localhost:5000/test123",
        ]),
    )
    self.assertNotEqual(
        sorted(list(web_crawler.crawl())),
        sorted([
            "https://google.com",
            "http://localhost:5000/com",
            "/test",
            "/test123",
        ]),
    )
def build(cls):
    command_print("Build started")
    cls.crawler = WebCrawler()
    # run method for index
    cls.crawler.scrape_index_pages()
    # run method for all country pages
    cls.crawler.scrape_country_pages()
    # run method for all continent pages
    cls.crawler.scrape_continent_pages()
    # create the index from memory
    cls.crawler.create_index_file()
    command_print("Build completed")
def download_imgs():
    with open(os.path.join(COMMON_FLAGS.json_dir, 'selected_keywords.json'), 'r') as fp:
        keywords = json.load(fp)
    print("keywords:", type(keywords), len(keywords), type(keywords[0]))

    api_keys = {
        'flickr': ('3845aa5608781b176e74bedd2a653b78', '19192eb5251a4809')
    }  # replace XXX.. and YYY.. by your own keys
    # images_nbr = 10000  # number of images to fetch
    images_nbr = 200  # 200 * 200 = 40k

    ### Crawl and download images ###
    from web_crawler import WebCrawler
    crawler = WebCrawler(api_keys, mindate=mindate, maxdate=maxdate)

    # 1. Crawl the web and collect URLs:
    crawler.collect_links_from_web(keywords, images_nbr, remove_duplicated_links=True)

    # 2. (alternative to the previous line) Load URLs from a file instead of the web:
    #crawler.load_urls(download_folder + "/links.txt")
    #crawler.load_urls_from_json(download_folder + "/links.json")

    # 3. Save URLs to download them later (optional):
    # crawler.save_urls(os.path.join(download_folder, "links.txt"))
    crawler.save_urls_to_json(
        os.path.join(url_folder, "links-%s-%s.json" % (mindate, maxdate)))
class HttpBin():
    def __init__(self, base_url):
        super().__init__()
        self.logger = getLogger(self.__class__.__name__)
        self.crawler = WebCrawler()
        self.base_url = base_url

    def request_json(self, method="get", params={}):
        return self.crawler.get_response_json(
            f"{self.base_url}/{method.lower()}",
            method=method,
            params=params,
            headers={})
class SearchEngine(object):
    '''
    SearchEngine is the main class of this system.

    Methods:
        add_page() -- Add a new html file into the file system of the OS.
        search() -- Search a key word in the system and return all the urls
                    and files containing this word.

    Attributes:
        inverted_index -- The data structure maintaining all the terms and the
                          corresponding URL/File.
        crawler -- An object that parses html-format input to raw text, then
                   preprocesses the raw text into an array of words which can
                   be saved into inverted_index.
    '''
    def __init__(self):
        self.inverted_index = InvertedIndex()
        self.crawler = WebCrawler()

    def add_page(self, page):
        if page.startswith('http'):
            # if the input starts with http it is a url, otherwise it is a file path
            words = self.crawler.add_url(page)
            self.inverted_index.add(page, words)
        else:
            words = self.crawler.add_pages(page)
            self.inverted_index.add(page, words)
        print()
        self.inverted_index.print_trie()

    def search(self, word):
        outs = self.inverted_index.search(word)
        if outs:
            print('Searching key word is {}'.format(word))
            for i, out in enumerate(outs):
                print('{}. {}'.format(i + 1, out))
            print()
        else:
            print('Can not find key {} in SearchEngine.\n'.format(word))

    def print_trie(self):
        self.inverted_index.print_trie()
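# A minimal usage sketch of the SearchEngine class above (the page paths and
# the search term are made up for illustration; the module layout is assumed
# to match the snippet).
if __name__ == '__main__':
    engine = SearchEngine()
    engine.add_page('http://example.com/index.html')  # index a url
    engine.add_page('pages/local_page.html')          # or a local file path
    engine.search('crawler')                          # print every url/file containing the word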
def web_crawler_main():
    """ check user input and start WebCrawler """
    opts, args = get_args()
    logger = get_logger()
    url = add_valid_protocol_prefix(opts.url)
    depth_limit = opts.depth_limit if 0 < opts.depth_limit <= DEFAULT__DEPTH_LIMIT else None
    time_out = opts.time_out if 0 < opts.time_out else None
    if not url or not depth_limit or not time_out:
        if not url:
            logger.error("invalid page address")
        if not depth_limit:
            logger.error("invalid depth limit")
        if not time_out:
            logger.error("invalid time out")
        raise SystemExit(1)
    domain_name = get_sub_domain_name(url)
    web_crawler = WebCrawler(url, domain_name, depth_limit, time_out, logger)
    web_crawler.start()
class DoubanBookComments:
    def __init__(self):
        self.logger = getLogger(self.__class__.__name__)
        self.commentUrl = getConfig()['task2']['comment_url']
        self.web_crawler = WebCrawler()
        self.db_helper = DbHelper()
        self.__comments = []

    def __process_comments(self):
        selector = self.web_crawler.get_parser_response(self.commentUrl, parser='lxml')
        commentElements = selector.xpath(
            "//div[@id='comments']/ul/li[@class='comment-item']")
        title = selector.xpath("//div[@id='content']/h1[1]/text()")[0].split(' ')[0]
        for commentEle in commentElements:
            try:
                score = commentEle.xpath(
                    "div[2]/h3[1]/span[2]/span[1]/@class")[0].split(' ')[1].replace(
                        'allstar', '').replace('0', '')
                content = commentEle.xpath("div[2]/p[1]/span[1]//text()")[0].strip()
                comment = Comment(**{
                    'title': title,
                    'content': content,
                    'score': score
                })
                self.__comments.append(comment)
            except:
                self.logger.error("Invalid element, skip...")

    def comments(self):
        if not self.__comments:
            self.__process_comments()
        return self.__comments

    def __sentiment(self, text):
        return SnowNLP(text).sentiments

    def sentiment(self):
        comments = [comment.to_dict() for comment in self.comments()]
        df = pd.DataFrame(comments)
        df['sentiment'] = df['content'].apply(self.__sentiment)
        print(f'Average sentiment score: {df.sentiment.mean()}')

    def store(self):
        comments = self.comments()
        for comment in comments:
            self.db_helper.insert(comment)
        self.db_helper.close()
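# A minimal sketch of the Comment model assumed by DoubanBookComments above
# (hypothetical; the real class and its DbHelper integration live elsewhere
# in the project):
class Comment:
    def __init__(self, title, content, score):
        self.title = title
        self.content = content
        self.score = score

    def to_dict(self):
        return {'title': self.title, 'content': self.content, 'score': self.score}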
def main():
    parser = ArgumentParser()
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-d", "--depth", type=int,
                       help="limit crawling by depth of directory tree (default, 10)")
    group.add_argument("-c", "--count", type=int,
                       help="limit crawling by number of pages")
    parser.add_argument("url_list",
                        help="file containing urls separated by newlines")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="set verbosity of program")
    parser.add_argument("-p", "--max-processes", type=int,
                        help="maximum number of processes to run in parallel (default is 10)")
    parser.add_argument("-t", "--max-threads", type=int,
                        help="maximum number of threads per process (default is 20)")
    args = parser.parse_args()

    # check if url_list file exists and that user has permission to read it
    if not os.path.isfile(args.url_list) or not os.access(args.url_list, os.R_OK):
        print("[-] File does not exist: {}".format(args.url_list))
        sys.exit(1)

    # get url list
    urls = list()
    with open(args.url_list, "r") as url_list_file:
        for url in url_list_file:
            urls.append(url.strip())

    crawler = WebCrawler(urls)

    # set custom parameters
    if args.max_processes:
        crawler.max_processes = args.max_processes
    if args.max_threads:
        crawler.max_threads = args.max_threads
    if args.verbose:
        crawler.verbose = True
    if args.depth:
        crawler.limit = "depth"
        crawler.limit_param = args.depth
    elif args.count:
        crawler.limit = "count"
        crawler.limit_param = args.count

    crawler.start()
    sys.exit(0)
def test_get_host(self):
    web_crawler = WebCrawler.get_host("http://localhost:5000")
    self.assertEqual(web_crawler, "localhost:5000")
"""
web_crawler_test_2_wykop.py

This test precisely explores www.wykop.pl in search for RSS feeds.
"""
import sys
import time

sys.path.append("../web_crawler")
from web_crawler import WebCrawler

sys.path.append("..")
from privileges import construct_full_privilege, privileges_bigger_or_equal

master_crawler = WebCrawler.create_master(
    privileges=construct_full_privilege(),
    start_url="http://www.wykop.pl/"
)

WebCrawler.create_worker(
    privileges=construct_full_privilege(),
    master=master_crawler,
    max_crawling_depth=3
)

master_crawler.run()
time.sleep(60 * 60 * 24 * 3)
master_crawler.terminate()
from configuration.config import Config
from web_crawler import WebCrawler
from crawler_observers.crawler_printer import CrawlerPrinter

if __name__ == '__main__':
    WebCrawler.crawl(Config.SEED_URL, CrawlerPrinter())
class DoubanBookReviews:
    def __init__(self):
        super().__init__()
        self.logger = getLogger(self.__class__.__name__)
        self.reviewUrl = getConfig()['task1']['review_url']
        self.web_crawler = WebCrawler()

    def get_reviews(self):
        selector = self.web_crawler.get_parser_response(self.reviewUrl, parser='lxml')
        reviewEles = selector.xpath('//div[@class="short-content"]')
        reviews = []
        for ele in reviewEles:
            review = ele.text.strip().replace(' ', '').replace('\n', '').replace(
                '\r', '').replace('(', '').replace('...', '')
            if review and review != '':
                reviews.append(review)
        return ''.join(reviews)

    def get_keywords(self):
        keys = [
            key for key in analyse.extract_tags(
                self.get_reviews(),
                topK=getConfig()['task1'].get('topK', 10),
                withWeight=False)
        ]
        return keys

    def generate_word_cloud(self, fileName=None):
        text_string = ','.join(self.get_keywords())
        wc = WordCloud(width=600,
                       height=200,
                       margin=2,
                       ranks_only=None,
                       prefer_horizontal=0.9,
                       mask=None,
                       color_func=None,
                       max_words=200,
                       stopwords=None,
                       random_state=None,
                       background_color='#ffffff',
                       font_step=1,
                       mode='RGB',
                       regexp=None,
                       collocations=True,
                       normalize_plurals=True,
                       contour_width=0,
                       colormap='viridis',
                       contour_color='Blues',
                       repeat=False,
                       scale=2,
                       min_font_size=10,
                       max_font_size=200,
                       font_path=os.environ.get(
                           'FONT_PATH',
                           os.path.join(os.path.dirname(__file__), 'PingFang.ttc')))
        wc.generate_from_text(text_string)
        if fileName:
            ROOT_DIR = os.getcwd()
            path = os.path.join(ROOT_DIR, "output")
            if not os.path.exists(path):
                os.makedirs("output")
            wc.to_file(os.path.join(path, fileName))
        else:
            plt.imshow(wc, interpolation='bilinear')
            plt.axis('off')
            plt.tight_layout()
            plt.show()
"""
web_crawler_test_1_rss_wp.py

This simple test fetches 10 RSS feeds from the rss.wp.pl website and then quits.
"""
import sys
import time

sys.path.append("../web_crawler")
from web_crawler import WebCrawler

sys.path.append("..")
from privileges import construct_full_privilege, privileges_bigger_or_equal

master_crawler = WebCrawler.create_master(
    privileges=construct_full_privilege(),
    start_url="http://rss.wp.pl/"
)

WebCrawler.create_worker(
    master=master_crawler,
    privileges=construct_full_privilege(),
    max_internal_expansion=10,
    max_database_updates=10
)

master_crawler.run()
time.sleep(120)
master_crawler.terminate()
class TestWebCrawler(unittest.TestCase):
    def setUp(self):
        self.crawler = WebCrawler(
            table='tbl_dfa256',
            url=table_url_map['tbl_dfa256'],
        )

    def test_crawl(self):
        pass

    def test_get_new_page(self):
        filepath = os.path.join(
            '/etc/calfresh/temp',
            self.crawler.table + '_' + str(datetime.date.today()),
        )
        new_page_path = self.crawler._get_new_page()
        self.assertEqual(new_page_path, filepath)
        self.crawler.url = None
        self.assertIsNone(self.crawler._get_new_page())

    def test_get_old_page(self):
        filepath = os.path.join(
            '/etc/calfresh/temp',
            self.crawler.table + '_' + str(datetime.date.today() - datetime.timedelta(days=1)),
        )
        old_page_path = self.crawler._get_old_page()
        self.assertEqual(old_page_path, filepath)

    def test_download_new_files(self):
        pass

    def test_get_filename(self):
        good_url1 = '/9/DSSDB/DataTables/DFA256FY17-18.xlsx?ver=2018-06-08-125617-450'
        good_url2 = '/9/DSSDB/DataTables/DFA256FY14-15.xls'
        good_filename1 = self.crawler._get_filename(good_url1)
        good_filename2 = self.crawler._get_filename(good_url2)
        self.assertEqual(good_filename1, 'DFA256FY17-18.xlsx')
        self.assertEqual(good_filename2, 'DFA256FY14-15.xls')

    def test_clean_up(self):
        two_days_ago = str(datetime.date.today() - datetime.timedelta(days=2))
        filepath1 = os.path.join('/etc/calfresh/temp', 'junk1_' + two_days_ago)
        filepath2 = os.path.join('/etc/calfresh/temp', 'junk2_' + two_days_ago)

        with open(filepath1, 'w') as fd:
            fd.write('test1')
        with open(filepath2, 'w') as fd:
            fd.write('test2')

        for root, dirs, files in os.walk('/etc/calfresh/temp'):
            self.assertIn('junk1_' + two_days_ago, files)
            self.assertIn('junk2_' + two_days_ago, files)
            for file in files:
                if file.endswith(str(two_days_ago)):
                    os.remove(os.path.join('/etc/calfresh/temp', file))

        for root, dirs, files in os.walk('/etc/calfresh/temp'):
            self.assertNotIn('junk1_' + two_days_ago, files)
            self.assertNotIn('junk2_' + two_days_ago, files)
# -*- coding: cp949 -*-
from web_crawler import WebCrawler

if __name__ == "__main__":
    web_crawler = WebCrawler('http://postgame.tistory.com', 'titleWrap', 'web_crawler.html')
    if web_crawler.proc_chkupdate() is True:
        print 'No update!'
    else:
        print 'New update!'
#!/usr/bin/env python

keywords = ["cats", "dogs", "birds"]
api_keys = {'google': ('XXXXXXXXXXXXXXXXXXXXXXXX', 'YYYYYYYYY'),
            'flickr': ('XXXXXXXXXXXXXXXXXXXXXXXX', 'YYYYYYYYY')}
images_nbr = 10  # number of images to fetch
download_folder = "./data"  # folder in which the images will be stored

### Crawl and download images ###
from web_crawler import WebCrawler
crawler = WebCrawler(api_keys)

# 1. Crawl the web and collect URLs:
crawler.collect_links_from_web(keywords, images_nbr, remove_duplicated_links=True)

# 2. (alternative to the previous line) Load URLs from a file instead of the web:
#crawler.load_urls(download_folder + "/links.txt")
#crawler.load_urls_from_json(download_folder + "/links.json")

# 3. Save URLs to download them later (optional):
crawler.save_urls(download_folder + "/links.txt")
#crawler.save_urls_to_json(download_folder + "/links.json")

# 4. Download the images:
crawler.download_images(target_folder=download_folder)

### Build the dataset ###
from dataset_builder import DatasetBuilder
dataset_builder = DatasetBuilder()
EXPORT_FILE = 'rss_feeds'

if len(sys.argv) == 1:
    print doc
    print 'Usage: python2 web_crawler_exporter.py [WEBSITE]'
    print 'where [WEBSITE] is a full url, for example: http://news.google.com'
    print 'See README.md for details.'
    print 'Output will be APPENDED to file named ' + EXPORT_FILE + '\n'

if len(sys.argv) == 1:
    exit()

master_crawler = WebCrawler.create_master(
    privileges=construct_full_privilege(),
    start_url=str(sys.argv[1]),
)

WebCrawler.create_worker(
    privileges=construct_full_privilege(),
    master=master_crawler,
    max_external_expansion=1000,
    max_internal_expansion=4,
    max_crawling_depth=3,
    list_export=True,
    export_dicts=True,
    export_file=EXPORT_FILE,
)

master_crawler.run()
def __init__(self):
    self._crawler = WebCrawler()
    self._parser = None
class App():
    OUTPUT_FOLDER = "outputs"

    def __init__(self, target):
        super().__init__()
        self.crawler = WebCrawler()
        target_config = getConfig().get("targets", {}).get(target)
        self.logger = getLogger(self.__class__.__name__)
        if not target_config:
            self.logger.error("target is not found in config.")
            raise Exception("target is not found in config.")
        self.logger.info(f"Application is processing target {target}")
        self.target_config = target_config
        self.max_threads = int(getConfig()["configs"]["max_threads"])
        self.sleep_time = int(self.target_config["sleep"])
        self.detail_urls = []
        self.items = []

    def _get_urls(self):
        url = self.target_config['url']
        page_size = self.target_config['page_size']
        total_items = self.target_config['total_items']
        pages = int(total_items / page_size) if total_items % page_size == 0 else int(
            total_items / page_size) + 1
        page_param = self.target_config['page_param']
        urls = tuple(f'{url}?{page_param}={page * page_size}'
                     for page in range(pages))
        return urls

    def _get_detail_url(self, pageUrl):
        urls = []
        try:
            self.logger.info(f"Processing url: {pageUrl}")
            sleep(self.sleep_time)
            item_type = self.target_config['item_type']
            item_attr = self.target_config['item_attr']
            info_attr = self.target_config['info_attr']
            bs_info = self.crawler.get_parser_response(pageUrl)
            item_blocks = bs_info.findAll(item_type, attrs={'class': item_attr})
            for item_block in item_blocks:
                info = item_block.find('div', attrs={'class': info_attr})
                detail_url = info.find('a').get('href')
                if detail_url:
                    urls.append(detail_url)
                else:
                    self.logger.warning(
                        "No detail url is found, item will be skipped.")
        except Exception as e:
            self.logger.error(f"Exception occurred on url: {pageUrl}.", e)
        return urls

    def _get_all_detail_urls(self):
        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
            tasks = [
                executor.submit(self._get_detail_url, url)
                for url in self._get_urls()
            ]
            for future in as_completed(tasks):
                self.detail_urls.extend(future.result())

    def _handle_item(self, url):
        items = []
        try:
            self.logger.info(f"Processing detail url: {url}")
            sleep(self.sleep_time)
            selector = self.crawler.get_parser_response(url, parser='lxml')
            item = {}
            for property in self.target_config['item_properties']:
                if property['name'] == "Link":
                    item[property['name']] = url
                else:
                    item[property['name']] = self._handle_property(selector, property)
            items.append(item)
        except Exception as e:
            self.logger.error(f"Exception occurred on url: {url}.", e)
        return items

    def _handle_property(self, selector, property):
        name = property['name']
        xpath_values = selector.xpath('//*' + property['xpath'])
        if name in [
                'Directors', 'ScriptWriters', 'Actors', 'Authors', 'HotComments'
        ]:
            return ' | '.join(xpath_values)
        elif name == 'Year':
            return xpath_values[0].strip().replace('(', '').replace(')', '')
        else:
            return xpath_values[0].strip()

    def get_items(self):
        self._get_all_detail_urls()
        with ThreadPoolExecutor(max_workers=self.max_threads) as t:
            jobs = [
                t.submit(self._handle_item, detail_url)
                for detail_url in self.detail_urls
            ]
            for future in as_completed(jobs):
                self.items.extend(future.result())

    def to_output_file(self):
        columns = list(
            map(lambda property: property["name"],
                self.target_config['item_properties']))
        df = pandas.DataFrame(self.items, columns=columns)
        path = f"{os.getcwd()}/{self.OUTPUT_FOLDER}"
        if not os.path.exists(path):
            self.logger.debug(f"Create output folder: {self.OUTPUT_FOLDER}")
            os.makedirs(self.OUTPUT_FOLDER)
        df.to_csv(f"{path}/{self.target_config['output_file_name']}.csv",
                  columns=columns,
                  index=False,
                  encoding=self.target_config['output_encoding'])
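# A hypothetical example of the per-target configuration the App class above
# expects. The key names are inferred from the attribute accesses in the code;
# the values are placeholders and do not come from the original project.
EXAMPLE_TARGET_CONFIG = {
    "url": "https://example.com/list",   # listing page to paginate over
    "page_param": "start",               # query parameter used for paging
    "page_size": 25,
    "total_items": 100,
    "sleep": 2,                          # seconds to wait between requests
    "item_type": "div",                  # tag holding one list item
    "item_attr": "item",                 # class of that tag
    "info_attr": "info",                 # class of the block containing the detail link
    "item_properties": [
        {"name": "Title", "xpath": "//h1/text()"},
        {"name": "Link", "xpath": ""},
        {"name": "Year", "xpath": "//span[@class='year']/text()"},
    ],
    "output_file_name": "example_target",
    "output_encoding": "utf-8",
}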
class TestWebCrawler(unittest.TestCase):
    ''' Test class for Sainsbury Crawler. '''

    def setUp(self):
        ''' Initial setup method.'''
        self.main_pg_url = 'http://hiring-tests.s3-website-eu-west-1.amazonaws' \
            '.com/2015_Developer_Scrape/5_products.html'
        self.prd_pg_url = 'http://hiring-tests.s3-website-eu-west-1.' \
            'amazonaws.com/2015_Developer_Scrape/sainsburys-apricot-' \
            'ripe---ready-320g.html'
        self.crawl_obj = WebCrawler()
        self.prod_page_html = BeautifulSoup(
            '<div class="productTitleDescriptionContainer"><h1>Sainsbury\'s Apricot Ripe'
            ' & Ready x5</h1></div><div class="pricing"><p class='
            '"pricePerUnit">£3.50</p></div><div class="productText">'
            '<p>Apricots</p></div>', 'lxml')
        self.main_page_html = BeautifulSoup(
            '<div class="productInfo"><a'
            ' href="http://hiring-tests.s3-website-eu-west-1.amazonaws.com/'
            '2015_Developer_Scrape/sainsburys-apricot-ripe---ready-320g.html"'
            '>Sainsbury\'s Apricot Ripe & Ready x5</a>'
            '<div class="productTitleDescriptionContainer"><h1>Sainsbury\'s'
            ' Apricot Ripe & Ready x5</h1></div><p class="pricePerUnit">'
            '£3.50<span class="pricePerUnitUnit">unit</span></p>'
            '<div class="productText">Apricots</div></div>'
            '<div class="productInfo"><a '
            'href="http://hiring-tests.s3-website-eu-west-1.amazonaws.com/'
            '2015_Developer_Scrape/sainsburys-avocado-xl-pinkerton-loose-'
            '300g.html">Sainsbury\'s Avocado Ripe & Ready XL Loose 300g'
            '</a><div class="productTitleDescriptionContainer"><h1>'
            'Sainsbury\'s Avocado Ripe & Ready XL Loose 300g</h1></div>'
            '<p class="pricePerUnit">£1.50<span class="pricePerUnitUnit"'
            '>unit</span></p><div class="productText">Avocados</div></div>', 'lxml')
        self.dummy_pg_url = 'http://hiring-tests.s3-website-eu-west-1.amazonaws' \
            '.com/2015_Developer_Scrape/dummy_products.html'

    def test_get_page_request_object(self):
        '''Validate the get page request method. It should return a response object.'''
        # Validate main product page
        req_obj = self.crawl_obj.get_page_request_object(self.main_pg_url)
        assert req_obj is not None
        self.assertEqual(req_obj.status_code, 200)
        self.assertEqual(req_obj.url, self.main_pg_url)
        # Validate child product page
        req_obj = self.crawl_obj.get_page_request_object(self.prd_pg_url)
        assert req_obj is not None
        self.assertEqual(req_obj.status_code, 200)
        self.assertEqual(req_obj.url, self.prd_pg_url)

    def test_get_products_urls(self):
        ''' test product urls from webpage '''
        # mock html extracted using beautifulsoup library
        def beautifulsoup_html_mock(param1, param2):
            return self.main_page_html
        import web_crawler
        web_crawler.BeautifulSoup = beautifulsoup_html_mock
        prd_link = self.crawl_obj.get_products_urls(self.main_pg_url)
        self.assertEqual(len(prd_link), 2)
        prd_link = self.crawl_obj.get_products_urls(self.dummy_pg_url)
        self.assertEqual(len(prd_link), 0)  # if there are no products

    def test_get_page_contents(self):
        ''' test one of the product page contents '''
        # mock html extracted using beautifulsoup library
        def beautifulsoup_html_mock(param1, param2):
            return self.prod_page_html
        import web_crawler
        web_crawler.BeautifulSoup = beautifulsoup_html_mock
        self.crawl_obj.get_page_contents(self.prd_pg_url)
        final_dict = self.crawl_obj.final_dict
        self.assertEqual(len(final_dict), 2)
        self.assertEqual(final_dict.get('total'), 3.5)
        self.assertEqual(final_dict['result'][0].get('description'), 'Apricots')
        self.assertEqual(final_dict['result'][0].get('unit_price'), 3.5)
        self.assertEqual(final_dict['result'][0].get('title'),
                         'Sainsbury\'s Apricot Ripe & Ready x5')

    def test_crawl_page(self):
        # mock html extracted using beautifulsoup library
        def beautifulsoup_html_mock(param1, param2):
            return self.main_page_html
        import web_crawler
        web_crawler.BeautifulSoup = beautifulsoup_html_mock
        prd_link = self.crawl_obj.get_products_urls(self.main_pg_url)
        self.assertEqual(len(prd_link), 2)
        self.crawl_obj.crawl_page(self.main_pg_url)
        final_dict = self.crawl_obj.final_dict
        self.assertEqual(len(final_dict), 2)
        self.assertEqual(final_dict.get('total'), 7)
        self.assertEqual(final_dict['result'][0].get('description'), 'Apricots')
        self.assertEqual(final_dict['result'][0].get('unit_price'), 3.5)
        self.assertEqual(final_dict['result'][0].get('title'),
                         'Sainsbury\'s Apricot Ripe & Ready x5')
"""
The name of this test means that the crawler will often jump to distant
locations, increasing its depth quickly.
"""
import sys
import time

sys.path.append("../web_crawler")
from web_crawler import WebCrawler

sys.path.append("..")
from privileges import construct_full_privilege, privileges_bigger_or_equal

master_crawler = WebCrawler.create_master(
    privileges=construct_full_privilege(),
    start_url="http://antyweb.pl/"
)

WebCrawler.create_worker(
    privileges=construct_full_privilege(),
    master=master_crawler,
    max_internal_expansion=5,
    max_external_expansion=3,
    max_crawling_depth=100,
)

master_crawler.run()
time.sleep(60 * 60 * 24 * 3)
master_crawler.terminate()
"""on importe notre module web crawler qu'on a créé """ import sys from web_crawler import WebCrawler DATA_TYPE = sys.argv[1] CRAWLING_ACTIVATED = True if sys.argv[2] == '1' else False # On prend en entrée un URL if DATA_TYPE == '1': print("Starting crawler on URL") if not CRAWLING_ACTIVATED: print("(Crawling deactivated)") STARTING_URL = sys.argv[3] CRAWLER = WebCrawler() CRAWLER.crawl_site(STARTING_URL, CRAWLING_ACTIVATED) CRAWLER.print_report() # On prend en entrée un fichier local (crawling désactivé) elif DATA_TYPE == '2': print("Starting crawler on local file") print("(Crawling deactivated)") CRAWLING_ACTIVATED = False LOCAL_FILE = sys.argv[3] CRAWLER = WebCrawler() CRAWLER.crawl_local_file(LOCAL_FILE) CRAWLER.print_report() # On prend en entrée des données en stdin elif DATA_TYPE == '3': print("What type of std:in do you want to use \
    )
    parser.add_argument(
        "--max_pages",
        help="limit the maximum pages that the crawler can parse")
    parser.add_argument("-v",
                        "--verbose",
                        help="increase the verbose output",
                        type=bool,
                        default=False)
    args = parser.parse_args()
    return {
        "url": args.url if args.url else "https://www.scrapehero.com/",
        "max_threads": args.max_threads if args.max_threads else min(
            32, os.cpu_count() + 4),  # default max_workers value of Python's ThreadPoolExecutor
        "max_pages": args.max_pages if args.max_pages else float("inf"),
        "verbosity": args.verbose,
    }


if __name__ == "__main__":
    args = initialise_arguments()
    web_crawler = WebCrawler(**args)
    pprint(web_crawler.crawl())
def main():
    crawl = WebCrawler(input())
    crawl.spider()
from web_crawler import WebCrawler
from worker import Worker

config = ConfigParser.RawConfigParser()
config.read('/etc/calfresh/calfresh.conf')

logger = logging.getLogger('root')

if __name__ == '__main__':
    logger.info('starting...')

    datapath = None
    for table in table_url_map.keys():
        try:
            crawler = WebCrawler(table, table_url_map[table])
            new_table_data = crawler.crawl()
            if new_table_data:
                worker = Worker(new_table_data)
                datapath = worker.work()
        except Exception as ex:
            logger.exception(ex)

    if datapath:
        loader = DataLoader()
        loader.load(datapath)

    crawler.clean_up()
    logger.info('finished')
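# Hypothetical shape of the table_url_map used by the script and the tests
# above (the real mapping of table names to report URLs is defined elsewhere
# in the project; the URL here is a placeholder):
table_url_map = {
    'tbl_dfa256': 'https://example.gov/path/to/DFA256/report/page',
    # ... one entry per table the crawler should fetch
}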