def test_crawl(self):
     """
     Tests crawl method
     The get_html method of the html_requester class is mocked to return the contents of html_test_data.html.
     This mocking allows for inputting test html data without having to host it online.
     """
     file_util = FileUtil()
     expected_result = file_util.get_file_contents("crawl_test_data.txt")
     web_crawler = WebCrawler()
     web_crawler.html_requester.get_html = lambda url: self.mock_get_html(
         url)
     actual_result = web_crawler.crawl("http://www.domain.com")
     self.assertEqual(expected_result, actual_result)
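# A minimal sketch of the mock_get_html helper the test above refers to, assuming it
# simply serves a local html_test_data.html fixture through the same FileUtil used for
# the expected result; the real helper may also vary its output by URL.
def mock_get_html(self, url):
    # Ignore the URL and return the local fixture instead of making a live request.
    file_util = FileUtil()
    return file_util.get_file_contents("html_test_data.html")
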
def main():
    try:
        global wc_dev, wc_itpro, dev_index, itpro_index, ranks
        wc_dev = WebCrawler()
        wc_itpro = WebCrawler()
        dev_index = wc_dev.crawl_web("dev")
        itpro_index = wc_itpro.crawl_web("itpro")
        ranks = wc_dev.compute_ranks()
        server = HTTPServer(('192.168.1.225', 8080), OffDocsHandler)
        print('started http server...')
        server.serve_forever()
    except KeyboardInterrupt:
        print('shutting down server')
        server.socket.close()
Example #3
 def __init__(self, target):
     super().__init__()
     self.crawler = WebCrawler()
     target_config = getConfig().get("targets", {}).get(target)
     self.logger = getLogger(self.__class__.__name__)
     if not target_config:
         self.logger.error("target is not found in config.")
         raise Exception("target is not found in config.")
     self.logger.info(f"Application is processing target {target}")
     self.target_config = target_config
     self.max_threads = int(getConfig()["configs"]["max_threads"])
     self.sleep_time = int(self.target_config["sleep"])
     self.detail_urls = []
     self.items = []
	def setUp(self):
		''' Initial setup method.'''

		self.main_pg_url = 'http://hiring-tests.s3-website-eu-west-1.amazonaws'\
			'.com/2015_Developer_Scrape/5_products.html'
		self.prd_pg_url = 'http://hiring-tests.s3-website-eu-west-1.' \
			'amazonaws.com/2015_Developer_Scrape/sainsburys-apricot-' \
			'ripe---ready-320g.html'
		self.crawl_obj = WebCrawler()
		self.prod_page_html = BeautifulSoup('<div class='\
			'"productTitleDescriptionContainer"><h1>Sainsbury\'s Apricot Ripe' \
			' & Ready x5</h1></div><div class="pricing"><p class=' \
			'"pricePerUnit">&pound;3.50</p></div><div class="productText">' \
			'<p>Apricots</p></div>', 'lxml')
		self.main_page_html = BeautifulSoup('<div class="productInfo"><a' \
			' href="http://hiring-tests.s3-website-eu-west-1.amazonaws.com/' \
			'2015_Developer_Scrape/sainsburys-apricot-ripe---ready-320g.html"' \
			'>Sainsbury\'s Apricot Ripe &amp; Ready x5</a>' \
			'<div class="productTitleDescriptionContainer"><h1>Sainsbury\'s' \
			' Apricot Ripe & Ready x5</h1></div><p class="pricePerUnit">' \
			'&pound;3.50<span class="pricePerUnitUnit">unit</span></p>' \
			'<div class="productText">Apricots</div></div>' \
			'<div class="productInfo"><a ' \
			'href="http://hiring-tests.s3-website-eu-west-1.amazonaws.com/' \
			'2015_Developer_Scrape/sainsburys-avocado-xl-pinkerton-loose-' \
			'300g.html">Sainsbury\'s Avocado Ripe &amp; Ready XL Loose 300g' \
			'</a><div class="productTitleDescriptionContainer"><h1>' \
			'Sainsbury\'s Avocado Ripe & Ready XL Loose 300g</h1></div>' \
			'<p class="pricePerUnit">&pound;1.50<span class="pricePerUnitUnit"'\
			'>unit</span></p><div class="productText">Avocados</div></div>',
		'lxml')

		self.dummy_pg_url = 'http://hiring-tests.s3-website-eu-west-1.amazonaws'\
			'.com/2015_Developer_Scrape/dummy_products.html'
class WebCrawlerTests(unittest.TestCase):

    def setUp(self):
        self.crawler = WebCrawler("hackbulgaria.com")

    def test_prepare_link(self):
        url = "http://hackbulgaria.com/"
        href = "/courses/"
        self.assertEqual(self.crawler.prepare_link(url, href),
                         "http://hackbulgaria.com/courses/")

    def test_is_outgoing_true(self):
        self.assertTrue(self.crawler.is_outgoing("http://facebook.com"))

    def test_is_outgoing_false(self):
        url = "https://hackbulgaria.com/media/content_media/"
        url += "JavaScript-Frontend-conspect.pdf"
        self.assertFalse(self.crawler.is_outgoing(url))
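
# A hedged sketch of the behaviour these tests expect; the project's actual
# prepare_link/is_outgoing implementations are not shown here and may differ.
from urllib.parse import urljoin, urlparse

def prepare_link(base_url, href):
    # urljoin("http://hackbulgaria.com/", "/courses/") -> "http://hackbulgaria.com/courses/"
    return urljoin(base_url, href)

def is_outgoing(domain, url):
    # A link is outgoing when its host is neither the crawler's domain nor www.<domain>.
    host = urlparse(url).netloc
    return host not in (domain, "www." + domain)
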
class SuspiciousURLCrawler:
    def __init__(self):
        self.web_crawler = WebCrawler()

    # Crawls domains that are typos of popular domains to scrape suspicious URLs
    def get_suspicious_urls(self, verbose: bool, num_urls=None) -> [str]:
        typos = reduce(set.union, map(generate_typos, _get_popular_domains()), set())
        typos = set(map(lambda typo: PROTOCOL + typo, typos))
        return self.web_crawler.scrape_links(typos, verbose, num_urls, _get_exclusion_sites())
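
# generate_typos, _get_popular_domains and PROTOCOL are defined elsewhere in the
# project; purely as an illustration of the idea in the comment above, a naive typo
# generator (character drops and adjacent swaps only) could look like this:
def generate_typos(domain: str) -> set:
    typos = set()
    for i in range(len(domain)):
        typos.add(domain[:i] + domain[i + 1:])  # dropped character
        if i + 1 < len(domain):
            swapped = list(domain)
            swapped[i], swapped[i + 1] = swapped[i + 1], swapped[i]
            typos.add("".join(swapped))  # adjacent characters swapped
    typos.discard(domain)
    return typos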
Example #7
def main(args):
    logging.basicConfig(filename='web_crawler_' + str(datetime.now()) + '.log',
                        filemode='w',
                        format='%(name)s - %(levelname)s - %(message)s',
                        level=logging.DEBUG)
    try:
        web_crawler_instance = WebCrawler(args[1])
        web_crawler_instance.crawl()
    except IndexError as ex:
        logging.error(
            f"An error has occurred whilst running the crawler, no URL was provided: {str(ex)}"
        )
    except DBConnectionError as ex:
        logging.error(
            f"An error has ocurred whilst connecting to DB: {str(ex)}")
    except Exception as ex:
        logging.error(
            f"An error has occurred whilst running the crawler: {str(ex)}")
    logging.info("Program finished running.")
Example #8
    def test_crawler(self):
        web_crawler = WebCrawler(url="http://localhost:5000",
                                 max_threads=32,
                                 max_pages=float("inf"))

        self.assertEqual(
            sorted(list(web_crawler.crawl())),
            sorted([
                "http://localhost:5000/com",
                "http://localhost:5000/test",
                "http://localhost:5000/test123",
            ]),
        )

        self.assertNotEqual(
            sorted(list(web_crawler.crawl())),
            sorted([
                "https://google.com", "http://localhost:5000/com", "/test",
                "/test123"
            ]),
        )
Example #9
 def build(cls):
     command_print("Build started")
     cls.crawler = WebCrawler()
     # run method for index
     cls.crawler.scrape_index_pages()
     # run method for all country pages
     cls.crawler.scrape_country_pages()
     # run method for all continent pages
     cls.crawler.scrape_continent_pages()
     # create the index from memory
     cls.crawler.create_index_file()
     command_print("Build completed")
Example #10
def download_imgs():
    with open(os.path.join(COMMON_FLAGS.json_dir, 'selected_keywords.json'),
              'r') as fp:
        keywords = json.load(fp)
    print("keywords:", type(keywords), len(keywords), type(keywords[0]))

    api_keys = {
        'flickr': ('3845aa5608781b176e74bedd2a653b78', '19192eb5251a4809')
    }  # replace XXX.. and YYY.. by your own keys
    # images_nbr = 10000 # number of images to fetch
    images_nbr = 200  # 200 * 200 = 40k

    ### Crawl and download images ###
    from web_crawler import WebCrawler
    crawler = WebCrawler(api_keys, mindate=mindate, maxdate=maxdate)

    # 1. Crawl the web and collect URLs:
    crawler.collect_links_from_web(keywords,
                                   images_nbr,
                                   remove_duplicated_links=True)

    # 2. (alternative to the previous line) Load URLs from a file instead of the web:
    #crawler.load_urls(download_folder + "/links.txt")
    #crawler.load_urls_from_json(download_folder + "/links.json")

    # 3. Save URLs to download them later (optional):
    # crawler.save_urls(os.path.join(download_folder, "links.txt"))
    crawler.save_urls_to_json(
        os.path.join(url_folder, "links-%s-%s.json" % (mindate, maxdate)))
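    # 4. Download the images (optional here; this mirrors the standalone scripts later
    # in this collection and assumes the same download_images API). The target folder
    # below is illustrative only:
    # crawler.download_images(target_folder=os.path.join(url_folder, "images-%s-%s" % (mindate, maxdate)))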
Example #11
class HttpBin():
    def __init__(self, base_url):
        super().__init__()
        self.logger = getLogger(self.__class__.__name__)
        self.crawler = WebCrawler()
        self.base_url = base_url

    def request_json(self, method="get", params={}):
        return self.crawler.get_response_json(
            f"{self.base_url}/{method.lower()}",
            method=method,
            params=params,
            headers={})
Example #12
class SearchEngine(object):
    ''' SearchEngine is the main class of this system.
        Methods:
            add_page() -- Add a new HTML file to the file system of the OS.
            search() -- Search for a keyword in the system and return every URL/file that contains it.
        Attributes:
            inverted_index -- The data structure that maintains all terms and their corresponding URLs/files.
            crawler -- An object that parses HTML input into raw text,
                       then preprocesses the raw text into an array of words that can be saved into inverted_index.
    '''

    def __init__(self):
        self.inverted_index = InvertedIndex()
        self.crawler = WebCrawler()

    def add_page(self, page):
        if page.startswith('http'):  # input starting with 'http' is a URL, otherwise it is a file path
            words = self.crawler.add_url(page)
            self.inverted_index.add(page, words)
        else:
            words = self.crawler.add_pages(page)
            self.inverted_index.add(page, words)
            print()
            self.inverted_index.print_trie()

    def search(self, word):
        outs = self.inverted_index.search(word)
        if outs:
            print('Searching key word is {}'.format(word))
            for i,out in enumerate(outs):
                print('{}. {}'.format(i+1, out))
            print()
        else:
            print('Cannot find key {} in SearchEngine.\n'.format(word))

    def print_trie(self):
        self.inverted_index.print_trie()
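
# InvertedIndex is not shown in this snippet; a minimal stand-in for the structure the
# docstring describes (term -> pages containing it), using a plain dict instead of the
# trie hinted at by print_trie():
class SimpleInvertedIndex:
    def __init__(self):
        self.index = {}

    def add(self, page, words):
        # Record that each word occurs on the given page/URL.
        for word in words:
            self.index.setdefault(word, set()).add(page)

    def search(self, word):
        # Return every page/URL that contains the word.
        return sorted(self.index.get(word, set()))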
Example #13
def web_crawler_main():
    """
        check user input and start WebCrawler
    """
    opts, args = get_args()
    logger = get_logger()

    url = add_valid_protocol_prefix(opts.url)
    depth_limit = opts.depth_limit if 0 < opts.depth_limit <= DEFAULT__DEPTH_LIMIT else None
    time_out = opts.time_out if 0 < opts.time_out else None

    if not url or not depth_limit or not time_out:
        if not url:
            logger.error("invalid page address")
        if not depth_limit:
            logger.error("invalid depth limit")
        if not time_out:
            logger.error("invalid time out")
        raise SystemExit(1)

    domain_name = get_sub_domain_name(url)
    web_crawler = WebCrawler(url, domain_name, depth_limit, time_out, logger)
    web_crawler.start()
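
# add_valid_protocol_prefix and get_sub_domain_name come from elsewhere in the project;
# a hedged sketch of what the prefix helper might do (prepend "http://" when no scheme
# is given, return None for empty input), shown for illustration only:
def add_valid_protocol_prefix(url):
    if not url:
        return None
    return url if url.startswith(("http://", "https://")) else "http://" + url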
Example #14
class DoubanBookComments:
    def __init__(self):
        self.logger = getLogger(self.__class__.__name__)
        self.commentUrl = getConfig()['task2']['comment_url']
        self.web_crawler = WebCrawler()
        self.db_helper = DbHelper()
        self.__comments = []

    def __process_comments(self):
        selector = self.web_crawler.get_parser_response(self.commentUrl,
                                                        parser='lxml')
        commentElements = selector.xpath(
            "//div[@id='comments']/ul/li[@class='comment-item']")
        title = selector.xpath("//div[@id='content']/h1[1]/text()")[0].split(
            ' ')[0]
        for commentEle in commentElements:
            try:
                score = commentEle.xpath("div[2]/h3[1]/span[2]/span[1]/@class"
                                         )[0].split(' ')[1].replace(
                                             'allstar', '').replace('0', '')
                content = commentEle.xpath(
                    "div[2]/p[1]/span[1]//text()")[0].strip()
                comment = Comment(**{
                    'title': title,
                    'content': content,
                    'score': score
                })
                self.__comments.append(comment)
            except Exception:
                self.logger.error("Invalid element, skip...")

    def comments(self):
        if not self.__comments:
            self.__process_comments()
        return self.__comments

    def __sentiment(self, text):
        return SnowNLP(text).sentiments

    def sentiment(self):
        comments = [comment.to_dict() for comment in self.comments()]
        df = pd.DataFrame(comments)
        df['sentiment'] = df['content'].apply(self.__sentiment)
        print(f'Average sentiment score: {df.sentiment.mean()}')

    def store(self):
        comments = self.comments()
        for comment in comments:
            self.db_helper.insert(comment)
        self.db_helper.close()
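
# A hedged usage sketch for the class above, using only methods it defines; the
# surrounding configuration ('task2'/'comment_url') and imports are assumed to exist.
if __name__ == '__main__':
    book_comments = DoubanBookComments()
    book_comments.sentiment()  # print the average SnowNLP sentiment score
    book_comments.store()      # persist the scraped comments via DbHelper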
Example #15
def main():
    parser = ArgumentParser()
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-d", "--depth", type=int, help="limit crawling by depth of directory tree (default, 10)")
    group.add_argument("-c", "--count", type=int, help="limit crawling by number of pages")
    parser.add_argument("url_list", help="file containing urls separated by newlines")
    parser.add_argument("-v", "--verbose", action="store_true", help="set verbosity of program")
    parser.add_argument("-p", "--max-processes", type=int, help="maximum number of processes to run in parallel (default is 10)")
    parser.add_argument("-t", "--max-threads", type=int, help="maximum number of threads per process (default is 20)")
    args = parser.parse_args()

    # check if url_list file exists and that user has permission to read it
    if not os.path.isfile(args.url_list) or not os.access(args.url_list, os.R_OK):
        print("[-] File does not exist: {}".format(args.url_list))
        sys.exit(1)

    # get url list
    urls = list()
    with open(args.url_list, "r") as url_list_file:
        for url in url_list_file:
            urls.append(url.strip())

    crawler = WebCrawler(urls)

    # set custom parameters
    if args.max_processes:
        crawler.max_processes = args.max_processes
    if args.max_threads:
        crawler.max_threads = args.max_threads
    if args.verbose:
        crawler.verbose = True
    if args.depth:
        crawler.limit = "depth"
        crawler.limit_param = args.depth
    elif args.count:
        crawler.limit = "count"
        crawler.limit_param = args.count

    crawler.start()
    sys.exit(0)
Example #16
 def test_get_host(self):
     web_crawler = WebCrawler.get_host("http://localhost:5000")
     self.assertEqual(web_crawler, "localhost:5000")
 def setUp(self):
     self.crawler = WebCrawler("hackbulgaria.com")
"""
web_crawler_test_2_wykop.py
This test specifically explores www.wykop.pl in search of RSS feeds.
"""

import sys
import time

sys.path.append("../web_crawler")
from web_crawler import WebCrawler

sys.path.append("..")
from privileges import construct_full_privilege, privileges_bigger_or_equal


master_crawler = WebCrawler.create_master(
    privileges=construct_full_privilege(),
    start_url="http://www.wykop.pl/"
)


WebCrawler.create_worker(
    privileges=construct_full_privilege(),
    master=master_crawler,
    max_crawling_depth=3
)

master_crawler.run()

time.sleep(60*60*24*3)
master_crawler.terminate()

Example #19
from configuration.config import Config
from web_crawler import WebCrawler
from crawler_observers.crawler_printer import CrawlerPrinter


if __name__ == '__main__':
    WebCrawler.crawl(Config.SEED_URL, CrawlerPrinter())
Example #20
class DoubanBookReviews:
    def __init__(self):
        super().__init__()
        self.logger = getLogger(self.__class__.__name__)
        self.reviewUrl = getConfig()['task1']['review_url']
        self.web_crawler = WebCrawler()

    def get_reviews(self):
        selector = self.web_crawler.get_parser_response(self.reviewUrl,
                                                        parser='lxml')
        reviewEles = selector.xpath('//div[@class=\"short-content\"]')
        reviews = []
        for ele in reviewEles:
            review = ele.text.strip().replace('&nbsp', '').replace(
                '\n', '').replace('\r', '').replace('(',
                                                    '').replace('...', '')
            if review and review != '':
                reviews.append(review)
        return ''.join(reviews)

    def get_keywords(self):
        keys = [
            key for key in analyse.extract_tags(self.get_reviews(),
                                                topK=getConfig()['task1'].get(
                                                    'topK', 10),
                                                withWeight=False)
        ]
        return keys

    def generate_word_cloud(self, fileName=None):
        text_string = ','.join(self.get_keywords())
        wc = WordCloud(width=600,
                       height=200,
                       margin=2,
                       ranks_only=None,
                       prefer_horizontal=0.9,
                       mask=None,
                       color_func=None,
                       max_words=200,
                       stopwords=None,
                       random_state=None,
                       background_color='#ffffff',
                       font_step=1,
                       mode='RGB',
                       regexp=None,
                       collocations=True,
                       normalize_plurals=True,
                       contour_width=0,
                       colormap='viridis',
                       contour_color='Blues',
                       repeat=False,
                       scale=2,
                       min_font_size=10,
                       max_font_size=200,
                       font_path=os.environ.get(
                           'FONT_PATH',
                           os.path.join(os.path.dirname(__file__),
                                        'PingFang.ttc')))

        wc.generate_from_text(text_string)
        if fileName:
            ROOT_DIR = os.getcwd()
            path = os.path.join(ROOT_DIR, "output")
            if not os.path.exists(path):
                os.makedirs("output")
            wc.to_file(os.path.join(path, fileName))
        else:
            plt.imshow(wc, interpolation='bilinear')
            plt.axis('off')
            plt.tight_layout()
            plt.show()
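
# A hedged usage sketch for the class above; the output file name is illustrative only.
if __name__ == '__main__':
    reviews = DoubanBookReviews()
    print(reviews.get_keywords())                # top keywords extracted via jieba
    reviews.generate_word_cloud('keywords.png')  # written under ./output/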
"""
web_crawler_test_1_rss_wp.py
This simple test fetches 10 RSS feeds from the rss.wp.pl website and then quits.
"""

import sys
import time

sys.path.append("../web_crawler")
from web_crawler import WebCrawler

sys.path.append("..")
from privileges import construct_full_privilege, privileges_bigger_or_equal


master_crawler = WebCrawler.create_master(
    privileges=construct_full_privilege(),
    start_url="http://rss.wp.pl/"
)


WebCrawler.create_worker(
    master=master_crawler,
    privileges=construct_full_privilege(),
    max_internal_expansion=10,
    max_database_updates=10
)

master_crawler.run()

time.sleep(120)
master_crawler.terminate()
Example #22
class TestWebCrawler(unittest.TestCase):
    def setUp(self):
        self.crawler = WebCrawler(
            table='tbl_dfa256',
            url=table_url_map['tbl_dfa256'],
        )

    def test_crawl(self):
        pass

    def test_get_new_page(self):
        filepath = os.path.join(
            '/etc/calfresh/temp',
            self.crawler.table + '_' + str(datetime.date.today()),
        )

        new_page_path = self.crawler._get_new_page()
        self.assertEqual(new_page_path, filepath)

        self.crawler.url = None
        self.assertIsNone(self.crawler._get_new_page())

    def test_get_old_page(self):
        filepath = os.path.join(
            '/etc/calfresh/temp',
            self.crawler.table + '_' +
            str(datetime.date.today() - datetime.timedelta(days=1)),
        )

        old_page_path = self.crawler._get_old_page()
        self.assertEqual(old_page_path, filepath)

    def test_download_new_files(self):
        pass

    def test_get_filename(self):
        good_url1 = '/9/DSSDB/DataTables/DFA256FY17-18.xlsx?ver=2018-06-08-125617-450'
        good_url2 = '/9/DSSDB/DataTables/DFA256FY14-15.xls'

        good_filename1 = self.crawler._get_filename(good_url1)
        good_filename2 = self.crawler._get_filename(good_url2)

        self.assertEqual(good_filename1, 'DFA256FY17-18.xlsx')
        self.assertEqual(good_filename2, 'DFA256FY14-15.xls')

    def test_clean_up(self):
        two_days_ago = str(datetime.date.today() - datetime.timedelta(days=2))
        filepath1 = os.path.join('/etc/calfresh/temp', 'junk1_' + two_days_ago)
        filepath2 = os.path.join('/etc/calfresh/temp', 'junk2_' + two_days_ago)

        with open(filepath1, 'w') as fd:
            fd.write('test1')
        with open(filepath2, 'w') as fd:
            fd.write('test2')

        for root, dirs, files in os.walk('/etc/calfresh/temp'):
            self.assertIn('junk1_' + two_days_ago, files)
            self.assertIn('junk2_' + two_days_ago, files)
            for file in files:
                if file.endswith(str(two_days_ago)):
                    os.remove(os.path.join('/etc/calfresh/temp', file))

        for root, dirs, files in os.walk('/etc/calfresh/temp'):
            self.assertNotIn('junk1_' + two_days_ago, files)
            self.assertNotIn('junk2_' + two_days_ago, files)
Example #23
 def setUp(self):
     self.crawler = WebCrawler(
         table='tbl_dfa256',
         url=table_url_map['tbl_dfa256'],
     )
Example #24
 def __init__(self, base_url):
     super().__init__()
     self.logger = getLogger(self.__class__.__name__)
     self.crawler = WebCrawler()
     self.base_url = base_url
Example #25
 def __init__(self):
     self.inverted_index = InvertedIndex()
     self.crawler = WebCrawler()
Example #26
# -*- coding: cp949 -*-
from web_crawler import WebCrawler


if __name__ == "__main__":

    web_crawler = WebCrawler('http://postgame.tistory.com', 'titleWrap', 'web_crawler.html')
    if web_crawler.proc_chkupdate() is True:
        print('No update!')
    else:
        print('New update!')
#!/usr/bin/env python

keywords = ["cats", "dogs", "birds"]
api_keys = {'google': ('XXXXXXXXXXXXXXXXXXXXXXXX', 'YYYYYYYYY'),
            'flickr': ('XXXXXXXXXXXXXXXXXXXXXXXX', 'YYYYYYYYY')}
images_nbr = 10 # number of images to fetch
download_folder = "./data" # folder in which the images will be stored

### Crawl and download images ###
from web_crawler import WebCrawler
crawler = WebCrawler(api_keys)

# 1. Crawl the web and collect URLs:
crawler.collect_links_from_web(keywords, images_nbr, remove_duplicated_links=True)

# 2. (alternative to the previous line) Load URLs from a file instead of the web:
#crawler.load_urls(download_folder + "/links.txt")
#crawler.load_urls_from_json(download_folder + "/links.json")

# 3. Save URLs to download them later (optional):
crawler.save_urls(download_folder + "/links.txt")
#crawler.save_urls_to_json(download_folder + "/links.json")

# 4. Download the images:
crawler.download_images(target_folder=download_folder)


### Build the dataset ###
from dataset_builder import DatasetBuilder
dataset_builder = DatasetBuilder()
Example #28
#!/usr/bin/env python

keywords = ["cats", "dogs", "birds"]
api_keys = {
    'google': ('XXXXXXXXXXXXXXXXXXXXXXXX', 'YYYYYYYYY'),
    'flickr': ('XXXXXXXXXXXXXXXXXXXXXXXX', 'YYYYYYYYY')
}
images_nbr = 10  # number of images to fetch
download_folder = "./data"  # folder in which the images will be stored

### Crawl and download images ###
from web_crawler import WebCrawler

crawler = WebCrawler(api_keys)

# 1. Crawl the web and collect URLs:
crawler.collect_links_from_web(keywords,
                               images_nbr,
                               remove_duplicated_links=True)

# 2. (alternative to the previous line) Load URLs from a file instead of the web:
#crawler.load_urls(download_folder + "/links.txt")
#crawler.load_urls_from_json(download_folder + "/links.json")

# 3. Save URLs to download them later (optional):
crawler.save_urls(download_folder + "/links.txt")
#crawler.save_urls_to_json(download_folder + "/links.json")

# 4. Download the images:
crawler.download_images(target_folder=download_folder)
EXPORT_FILE = 'rss_feeds'

if len(sys.argv) == 1:
    print(doc)
    print('Usage: python2 web_crawler_exporter.py [WEBSITE]')
    print('where [WEBSITE] is a full url, for example: http://news.google.com')
    print('See README.md for details.')

print('Output will be APPENDED to file named ' + EXPORT_FILE + '\n')

if len(sys.argv) == 1:
    exit()

master_crawler = WebCrawler.create_master(
    privileges=construct_full_privilege(),
    start_url=str(sys.argv[1]),
)

WebCrawler.create_worker(
    privileges=construct_full_privilege(),
    master=master_crawler,
    max_external_expansion=1000,
    max_internal_expansion=4,
    max_crawling_depth=3,
    list_export=True,
    export_dicts=True,
    export_file=EXPORT_FILE,
)

master_crawler.run()
 def __init__(self):
     self.web_crawler = WebCrawler()
Example #31
 def __init__(self):
     super().__init__()
     self.logger = getLogger(self.__class__.__name__)
     self.reviewUrl = getConfig()['task1']['review_url']
     self.web_crawler = WebCrawler()
Example #32
 def __init__(self):
     self._crawler = WebCrawler()
     self._parser = None
Example #33
class App():

    OUTPUT_FOLDER = "outputs"

    def __init__(self, target):
        super().__init__()
        self.crawler = WebCrawler()
        target_config = getConfig().get("targets", {}).get(target)
        self.logger = getLogger(self.__class__.__name__)
        if not target_config:
            self.logger.error("target is not found in config.")
            raise Exception("target is not found in config.")
        self.logger.info(f"Application is processing target {target}")
        self.target_config = target_config
        self.max_threads = int(getConfig()["configs"]["max_threads"])
        self.sleep_time = int(self.target_config["sleep"])
        self.detail_urls = []
        self.items = []

    def _get_urls(self):
        url = self.target_config['url']
        page_size = self.target_config['page_size']
        total_items = self.target_config['total_items']
        pages = (total_items // page_size if total_items % page_size == 0
                 else total_items // page_size + 1)
        page_param = self.target_config['page_param']
        urls = tuple(f'{url}?{page_param}={ page * page_size }'
                     for page in range(pages))
        return urls

    def _get_detail_url(self, pageUrl):
        urls = []
        try:
            self.logger.info(f"Processing url: {pageUrl}")
            sleep(self.sleep_time)
            item_type = self.target_config['item_type']
            item_attr = self.target_config['item_attr']
            info_attr = self.target_config['info_attr']
            bs_info = self.crawler.get_parser_response(pageUrl)
            item_blocks = bs_info.findAll(item_type,
                                          attrs={'class': item_attr})
            for item_block in item_blocks:
                info = item_block.find('div', attrs={'class': info_attr})
                detail_url = info.find('a').get('href')
                if detail_url:
                    urls.append(detail_url)
                else:
                    self.logger.warning(
                        "No detail url is found, item will be skipped.")
        except Exception as e:
            self.logger.error(f"Exception occurred on url: {pageUrl}.", e)
        return urls

    def _get_all_detail_urls(self):
        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
            tasks = [
                executor.submit(self._get_detail_url, url)
                for url in self._get_urls()
            ]
            for future in as_completed(tasks):
                self.detail_urls.extend(future.result())

    def _handle_item(self, url):
        items = []
        try:
            self.logger.info(f"Processing detail url: {url}")
            sleep(self.sleep_time)
            selector = self.crawler.get_parser_response(url, parser='lxml')
            item = {}
            for property in self.target_config['item_properties']:
                if property['name'] == "Link":
                    item[property['name']] = url
                else:
                    item[property['name']] = self._handle_property(
                        selector, property)
            items.append(item)
        except Exception as e:
            self.logger.error(f"Exception occurred on url: {url}.", e)
        return items

    def _handle_property(self, selector, property):
        name = property['name']
        xpath_values = selector.xpath('//*' + property['xpath'])
        if name in [
                'Directors', 'ScriptWriters', 'Actors', 'Authors',
                'HotComments'
        ]:
            return ' | '.join(xpath_values)
        elif name == 'Year':
            return xpath_values[0].strip().replace('(', '').replace(')', '')
        else:
            return xpath_values[0].strip()

    def get_items(self):
        self._get_all_detail_urls()
        with ThreadPoolExecutor(max_workers=self.max_threads) as t:
            jobs = [
                t.submit(self._handle_item, detail_url)
                for detail_url in self.detail_urls
            ]
            for future in as_completed(jobs):
                self.items.extend(future.result())

    def to_output_file(self):
        columns = list(
            map(lambda property: property["name"],
                self.target_config['item_properties']))
        df = pandas.DataFrame(self.items, columns=columns)
        path = f"{os.getcwd()}/{self.OUTPUT_FOLDER}"
        if not os.path.exists(path):
            self.logger.debug(f"Create output folder: {self.OUTPUT_FOLDER}")
            os.makedirs(self.OUTPUT_FOLDER)
        df.to_csv(f"{path}/{self.target_config['output_file_name']}.csv",
                  columns=columns,
                  index=False,
                  encoding=self.target_config['output_encoding'])
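
# The App class reads all of its settings through getConfig(); a hedged sketch of the
# configuration shape, inferred only from the keys the class accesses (every value
# below is illustrative, not taken from the project):
EXAMPLE_CONFIG = {
    "configs": {"max_threads": 8},
    "targets": {
        "movies": {
            "url": "https://example.com/list",
            "page_size": 25,
            "total_items": 250,
            "page_param": "start",
            "sleep": 2,
            "item_type": "div",
            "item_attr": "item",
            "info_attr": "info",
            "item_properties": [
                {"name": "Title", "xpath": "[@class='title']/text()"},
                {"name": "Link", "xpath": ""},
            ],
            "output_file_name": "movies",
            "output_encoding": "utf-8",
        }
    },
}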
class TestWebCrawler(unittest.TestCase):
	''' Test class for Sainsbury Crawler. '''

	def setUp(self):
		''' Initial setup method.'''

		self.main_pg_url = 'http://hiring-tests.s3-website-eu-west-1.amazonaws'\
			'.com/2015_Developer_Scrape/5_products.html'
		self.prd_pg_url = 'http://hiring-tests.s3-website-eu-west-1.' \
			'amazonaws.com/2015_Developer_Scrape/sainsburys-apricot-' \
			'ripe---ready-320g.html'
		self.crawl_obj = WebCrawler()
		self.prod_page_html = BeautifulSoup('<div class='\
			'"productTitleDescriptionContainer"><h1>Sainsbury\'s Apricot Ripe' \
			' & Ready x5</h1></div><div class="pricing"><p class=' \
			'"pricePerUnit">&pound;3.50</p></div><div class="productText">' \
			'<p>Apricots</p></div>', 'lxml')
		self.main_page_html = BeautifulSoup('<div class="productInfo"><a' \
			' href="http://hiring-tests.s3-website-eu-west-1.amazonaws.com/' \
			'2015_Developer_Scrape/sainsburys-apricot-ripe---ready-320g.html"' \
			'>Sainsbury\'s Apricot Ripe &amp; Ready x5</a>' \
			'<div class="productTitleDescriptionContainer"><h1>Sainsbury\'s' \
			' Apricot Ripe & Ready x5</h1></div><p class="pricePerUnit">' \
			'&pound;3.50<span class="pricePerUnitUnit">unit</span></p>' \
			'<div class="productText">Apricots</div></div>' \
			'<div class="productInfo"><a ' \
			'href="http://hiring-tests.s3-website-eu-west-1.amazonaws.com/' \
			'2015_Developer_Scrape/sainsburys-avocado-xl-pinkerton-loose-' \
			'300g.html">Sainsbury\'s Avocado Ripe &amp; Ready XL Loose 300g' \
			'</a><div class="productTitleDescriptionContainer"><h1>' \
			'Sainsbury\'s Avocado Ripe & Ready XL Loose 300g</h1></div>' \
			'<p class="pricePerUnit">&pound;1.50<span class="pricePerUnitUnit"'\
			'>unit</span></p><div class="productText">Avocados</div></div>',
		'lxml')

		self.dummy_pg_url = 'http://hiring-tests.s3-website-eu-west-1.amazonaws'\
			'.com/2015_Developer_Scrape/dummy_products.html'

	def test_get_page_request_object(self):
		'''Validate the get_page_request_object method; it should return a
		response object.'''
		# Validate main product page
		req_obj = self.crawl_obj.get_page_request_object(self.main_pg_url)
		assert req_obj is not None
		self.assertEqual(req_obj.status_code, 200)
		self.assertEqual(req_obj.url, self.main_pg_url)
		
		# Validate child product page
		req_obj = self.crawl_obj.get_page_request_object(self.prd_pg_url)
		assert req_obj is not None
		self.assertEqual(req_obj.status_code, 200)
		self.assertEqual(req_obj.url, self.prd_pg_url)

	def test_get_products_urls(self):
		''' test product urls from webpage '''

		# mock html extracted using beautifulsoup library
		def beautifulsoup_html_mock(param1, param2):
			return self.main_page_html
		import web_crawler
		web_crawler.BeautifulSoup = beautifulsoup_html_mock
		prd_link = self.crawl_obj.get_products_urls(self.main_pg_url)
		self.assertEqual(len(prd_link), 2)
	
		prd_link = self.crawl_obj.get_products_urls(self.dummy_pg_url)
		self.assertEqual(len(prd_link), 0)  # if there are no products

	def test_get_page_contents(self):
		''' test one of the product page contents '''
		
		# mock html extracted using beautifulsoup library
		def beautifulsoup_html_mock(param1, param2):
			return self.prod_page_html

		import web_crawler
		web_crawler.BeautifulSoup = beautifulsoup_html_mock
		self.crawl_obj.get_page_contents(self.prd_pg_url)
		final_dict = self.crawl_obj.final_dict
		self.assertEqual(len(final_dict), 2)
		self.assertEqual(final_dict.get('total'), 3.5)
		self.assertEqual(final_dict['result'][0].get('description'), 'Apricots')
		self.assertEqual(final_dict['result'][0].get('unit_price'), 3.5)
		self.assertEqual(final_dict['result'][0].get('title'), 'Sainsbury\'s' \
			' Apricot Ripe & Ready x5')

	def test_crawl_page(self):
		
		# mock html extracted using beautifulsoup library
		def beautifulsoup_html_mock(param1, param2):
			return self.main_page_html
		
		import web_crawler
		web_crawler.BeautifulSoup = beautifulsoup_html_mock
		
		prd_link = self.crawl_obj.get_products_urls(self.main_pg_url)
		self.assertEqual(len(prd_link), 2)

		self.crawl_obj.crawl_page(self.main_pg_url)
		final_dict = self.crawl_obj.final_dict
		self.assertEqual(len(final_dict), 2)
		
		self.assertEqual(final_dict.get('total'), 7)
		self.assertEqual(final_dict['result'][0].get('description'), 'Apricots')
		self.assertEqual(final_dict['result'][0].get('unit_price'), 3.5)
		self.assertEqual(final_dict['result'][0].get('title'), 'Sainsbury\'s' \
			' Apricot Ripe & Ready x5')
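
	# A hedged alternative to the manual monkey-patching above: unittest.mock.patch
	# scopes the BeautifulSoup replacement to a single test. The assertion mirrors
	# test_get_page_contents and is illustrative only.
	def test_get_page_contents_with_mock_patch(self):
		from unittest import mock
		with mock.patch('web_crawler.BeautifulSoup', return_value=self.prod_page_html):
			self.crawl_obj.get_page_contents(self.prd_pg_url)
			self.assertEqual(self.crawl_obj.final_dict.get('total'), 3.5)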
The name of this test means that the crawler will often jump to distant
locations, quickly increasing its depth.
"""

import sys
import time

sys.path.append("../web_crawler")
from web_crawler import WebCrawler

sys.path.append("..")
from privileges import construct_full_privilege, privileges_bigger_or_equal


master_crawler = WebCrawler.create_master(
    privileges=construct_full_privilege(),
    start_url="http://antyweb.pl/"
)


WebCrawler.create_worker(
    privileges=construct_full_privilege(),
    master=master_crawler,
    max_internal_expansion=5,
    max_external_expansion=3,
    max_crawling_depth=100,
)

master_crawler.run()

time.sleep(60*60*24*3)
master_crawler.terminate()
Example #36
 def __init__(self):
     self.logger = getLogger(self.__class__.__name__)
     self.commentUrl = getConfig()['task2']['comment_url']
     self.web_crawler = WebCrawler()
     self.db_helper = DbHelper()
     self.__comments = []
"""on importe notre module web crawler qu'on a créé
   """
import sys
from web_crawler import WebCrawler

DATA_TYPE = sys.argv[1]
CRAWLING_ACTIVATED = sys.argv[2] == '1'

# The input is a URL
if DATA_TYPE == '1':
    print("Starting crawler on URL")
    if not CRAWLING_ACTIVATED:
        print("(Crawling deactivated)")
    STARTING_URL = sys.argv[3]
    CRAWLER = WebCrawler()
    CRAWLER.crawl_site(STARTING_URL, CRAWLING_ACTIVATED)
    CRAWLER.print_report()

# The input is a local file (crawling deactivated)
elif DATA_TYPE == '2':
    print("Starting crawler on local file")
    print("(Crawling deactivated)")
    CRAWLING_ACTIVATED = False
    LOCAL_FILE = sys.argv[3]
    CRAWLER = WebCrawler()
    CRAWLER.crawl_local_file(LOCAL_FILE)
    CRAWLER.print_report()

# The input is data from stdin
elif DATA_TYPE == '3':
    print("What type of std:in do you want to use \
Example #38
    )
    parser.add_argument(
        "--max_pages",
        help="limit the maximum pages that the crawler can parse")
    parser.add_argument("-v",
                        "--verbose",
                        help="increase the verbose output",
                        type=bool,
                        default=False)

    args = parser.parse_args()

    return {
        "url": args.url if args.url else "https://www.scrapehero.com/",
        "max_threads": args.max_threads if args.max_threads else min(
            32, os.cpu_count() + 4),  # ThreadPoolExecutor's default max_workers
        "max_pages": args.max_pages if args.max_pages else float("inf"),
        "verbosity": args.verbose,
    }


if __name__ == "__main__":
    args = initialise_arguments()
    web_crawler = WebCrawler(**args)
    pprint(web_crawler.crawl())
Example #39
def main():
    crawl = WebCrawler(input())
    crawl.spider()
Example #40
from web_crawler import WebCrawler
from worker import Worker

config = ConfigParser.RawConfigParser()
config.read('/etc/calfresh/calfresh.conf')

logger = logging.getLogger('root')

if __name__ == '__main__':

    logger.info('starting...')

    datapath = None
    for table in table_url_map.keys():
        try:
            crawler = WebCrawler(table, table_url_map[table])
            new_table_data = crawler.crawl()

            if new_table_data:
                worker = Worker(new_table_data)
                datapath = worker.work()

        except Exception as ex:
            logger.exception(ex)

    if datapath:
        loader = DataLoader()
        loader.load(datapath)

    crawler.clean_up()
    logger.info('finished')