def main():
    _setup_logging()
    downloader = SeleniumHTMLDownloader(
        r'C:\Users\hananavr\Documents\לימודים\מעבדה בין תחומית\facebookCrawler copy\crawler\chromedriver.exe'
    )

    # mongodb://<dbuser>:<dbpassword>@ds215633.mlab.com:15633/pytheas
    store = MongoItemStore(host='ds215633.mlab.com',
                           port='15633',
                           db='pytheas',
                           article_collection='hanan',
                           username='******',
                           password='******')

    items_loader = MongoItemsLoader(host='ds215633.mlab.com',
                                    port='15633',
                                    db='pytheas',
                                    items_collection='hanan',
                                    username='******',
                                    password='******')
    crawler = Crawler(downloader, {
        'www.facebook.com/': FacebookParser(),
        'www.twitter.com/': TwitterParser()
    }, store, items_loader)
    crawler.crawl(FACEBOOK_PAGE_TO_DOWNLOAD_FROM)


def crawl(url, output_dir, depth=2, method="normal", gecko_path="geckodriver", page_name=None, custom_stats_handler=None, custom_process_handler=None):
    head_handlers = {}
    get_handlers = {}

    # derive the page name for sub-directories etc. if no custom name was given
    if page_name is None:
        page_name = urlparse(url).netloc

    get_handlers['application/pdf'] = LocalStoragePDFHandler(
        directory=output_dir, subdirectory=page_name)

    if custom_stats_handler is None:
        head_handlers['application/pdf'] = CSVStatsPDFHandler(directory=output_dir, name=page_name)
    else:
        for content_type, handler in custom_stats_handler.items():
            head_handlers[content_type] = handler

    if custom_process_handler is None:
        process_handler = ProcessHandler()
    else:
        process_handler = custom_process_handler

    if not get_handlers and not head_handlers:
        raise ValueError('You did not specify any output')

    crawler = Crawler(
        downloader=requests_downloader,
        head_handlers=head_handlers,
        get_handlers=get_handlers,
        follow_foreign_hosts=False,
        crawl_method=method,
        gecko_path=gecko_path,
        process_handler=process_handler
    )
    crawler.crawl(url, depth)
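
A quick usage sketch for the crawl() helper above; the URL, output directory, and depth are illustrative placeholders, and requests_downloader plus the handler classes are assumed to be importable from the surrounding project.

# Hypothetical invocation of crawl(); all values below are placeholders.
crawl('https://example.com', output_dir='./pdf_dump', depth=1,
      method='normal', page_name='example')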
Example #3
def main():
    _setup_logging()
    downloader = SeleniumHTMLDownloader('./lib/chromedriver.exe')
    store = MongoArticleStore("localhost", 27017, "Crawler", "Articles")
    crawler = Crawler(downloader, {
        'ynet.co.il': YnetParser()
    }, store)
    crawler.crawl('https://www.ynet.co.il')
Example #4
    def test_lucky_path(self):
        # Simulates parsed pages whose posts are the ints in the half-open range [page_index*7, (page_index+1)*7)
        crawler = Crawler(FetcherMock(lambda x: range(x*7, (x+1)*7)), ParserMock())

        # Check when number of posts is not a multiple of posts per page
        result = crawler.crawl(32)
        self.assertEqual(result, list(range(32)))

        # Check when number of posts is a multiple of posts per page
        result = crawler.crawl(21)
        self.assertEqual(result, list(range(21)))
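
For context, the mocks used in this test might look roughly like the sketch below; the method names are assumptions, since the real FetcherMock and ParserMock are defined elsewhere in the test suite.

class FetcherMock:
    # Hypothetical stand-in: delegates "fetching" a page to the supplied function.
    def __init__(self, fetch_page):
        self._fetch_page = fetch_page

    def fetch(self, page_index):
        return self._fetch_page(page_index)


class ParserMock:
    # Hypothetical stand-in: "parses" fetched posts by returning them unchanged.
    def parse(self, raw_posts):
        return list(raw_posts)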
Example #5
def crawl(url, cfgs):
    click.secho("Crawler will begin on '{url}' with the settings below:\n".format(url=url), fg='green')
    config = configs.load_config_section(config_section=cfgs)
    if config is None:
        print(f"Invalid config {cfgs}. Switching to DEFAULT.")
        config = configs.load_config_section(config_section='DEFAULT')
    else:
        print(f"Config set {cfgs} loaded.")
    click.echo()
    crawler = Crawler()
    print(f"Target URL = {url}")
    crawler.crawl(url, config['traversal'], config['user_agent'], int(config['max_depth']), int(config['max_total']))
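
The command assumes an INI-style settings file read by configs.load_config_section(); judging by the keys accessed above, a DEFAULT section could look like this (values are illustrative):

[DEFAULT]
traversal = bfs
user_agent = Mozilla/5.0 (compatible; ExampleCrawler/1.0)
max_depth = 3
max_total = 100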
Example #6
def main():

    logging.config.fileConfig("logging.conf")
    logger = logging.getLogger("sLogger")

    logger.info("Crawling started.")
    crawler = Crawler()
    crawler.crawl()
    logger.info("Crawling finished.")

    logger.info("Generating CSV files...")
    generate_data.generate()
    logger.info("CSV Files are generated.")

    sys.exit()
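
logging.config.fileConfig("logging.conf") expects a configparser-format file; a minimal version defining the sLogger logger used above could look like this (illustrative, not the project's actual file):

[loggers]
keys=root,sLogger

[handlers]
keys=consoleHandler

[formatters]
keys=simpleFormatter

[logger_root]
level=WARNING
handlers=consoleHandler

[logger_sLogger]
level=INFO
handlers=consoleHandler
qualname=sLogger
propagate=0

[handler_consoleHandler]
class=StreamHandler
level=INFO
formatter=simpleFormatter
args=(sys.stdout,)

[formatter_simpleFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s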
Example #7
def test(url, vector_input, sensitive_input, random, speed):
	"""Uses provided vectors and input to test against target."""
	# TODO(piper): pass files in to test.
	c = Crawler(url[0], auth=True)

	if vector_input:
		vectored = c.crawl([Gatherer()])
		
	if sensitive_input:
		[print(line) for line in sensitive_input]
	
	# result = c.crawl([VectorGatherer()])
	print("Finished testing...")
Example #8
    def test_searcher_with_seen_urls(self):
        with patch.object(Crawler, 'get_html') as mock_get_html:
            mock_get_html.return_value = '<a href=http://scala-lang.org></a>' \
                                         '<a href=https://scala11.html></a>'
            with patch.object(Crawler, 'write_html') as mock_write_html:
                mock_write_html.return_value = None
                test_crawler = Crawler(
                    'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                    ['scala'], {}, 2)
                test_crawler.seen_urls.add(Page(URL('http://scala-lang.org')))
                test_result = test_crawler.crawl()
                test_crawler.close()
                assert 'http://scala-lang.org' not in test_result
Example #9
    def test_crawler_zero_result(self):
        with patch.object(Crawler, 'get_html') as mock_get_html:
            mock_get_html.return_value = '<a href=https://scala1.html></a>'
            with patch.object(Crawler, 'write_html') as mock_write_html:
                mock_write_html.return_value = None
                test_crawler = Crawler(
                    'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                    ['dog'],
                    {},
                    2)
                test_result = test_crawler.crawl()
                test_crawler.close()
                self.assertEqual(test_result, set())
Example #10
    def test_update_parents(self):
        with patch.object(Crawler, 'get_html') as mock_get_html:
            mock_get_html.return_value = '<a href=http://a/c/></a>' \
                                         '<a href=http://a/b/></a>'
            with patch.object(Crawler, 'write_html') as mock_write_html:
                mock_write_html.return_value = None
                test_crawler = Crawler(
                    'http://a',
                    [''], {}, max_urls_count=3)
                test_result = test_crawler.crawl()
                test_crawler.close()
                for page in test_result:
                    if page.parent:
                        self.assertEqual(page.parent,
                                         Page(URL('http://a')))
Example #11
    def run():
        """Execute the cli action

        Returns:
            None
        """
        parser = argparse.ArgumentParser(
            description="Simple one domain web crawler by Jonathan Harden")

        parser.add_argument(
            "domain",
            help=
            "Domain to crawl (will not leave the subdomain specified and will ignore any path part)",
        )
        parser.add_argument(
            "-v",
            "--verbose",
            action="store_true",
            help="Verbose, print INFO level logging",
        )

        args = parser.parse_args()

        if args.verbose:
            logging.basicConfig(level=logging.INFO)

        crawler = Crawler(args.domain)
        crawler.crawl()

        for page in crawler.site_map.all_pages():
            print("Page: {}".format(page.link))
            print("    Outbound Links:")
            for out_link in set(page.out_links):
                print("        {}".format(out_link))

            print("\n\n")
Example #12
def main(config):
    login_args = '{student} {password} {semester}'.format(**config['account'])
    cookie = loginceiba.info(*login_args.split())

    if cookie == 1:
        print("can't login!!")
        return

    print('login_success, cookie:')
    print(cookie)

    crawler = Crawler(cookie.strip('\n'))
    courses = crawler.crawl()

    notifications, calendars, downloads = diff.diff(courses[0], [])
    notifier = Notifier()
    notifier.show_diff_notifications(notifications)
    downloadfile.downloadfile(downloads)
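
main() expects a config object whose 'account' section provides student, password, and semester fields (inferred from the format string above); a hypothetical call:

config = {
    'account': {
        'student': 'b01234567',   # placeholder student ID
        'password': '********',   # placeholder password
        'semester': '107-2',      # placeholder semester code
    },
}
main(config)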
Example #13
def discover(url, common_words=None):
	"""Retrieves information from the provided URL."""
	print("Beginning...")
	c = Crawler(url, auth=True)

	result = c.crawl([CountGatherer(), GuessingGatherer(), Gatherer(), URLParamsGatherer()])
	print("FINISHED CRAWLING")

	print("")
	print("")
	print("")
	print(result)
	print("")
	print("")
	print("")

	r = requests.get(url, verify=False)
	report(inputs, None, cookies)
Example #14
    def test_searcher_with_result(self):
        with patch.object(Crawler, 'get_html') as mock_get_html:
            mock_get_html.return_value = '<a href=https://scala1.html></a>' \
                                         '<a href=https://scala2.html></a>' \
                                         '<a href=https://scala3.html></a>' \
                                         '<a href=https://scala4.html></a>' \
                                         '<a href=https://scala5.html></a>' \
                                         '<a href=https://scala6.html></a>' \
                                         '<a href=https://scala7.html></a>' \
                                         '<a href=https://scala8.html></a>' \
                                         '<a href=https://scala9.html></a>' \
                                         '<a href=https://scala10.html></a>' \
                                         '<a href=https://scala11.html></a>'
            with patch.object(Crawler, 'write_html') as mock_write_html:
                mock_write_html.return_value = None
                test_crawler = Crawler(
                    'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                    ['scala'], {})
                test_result = test_crawler.crawl()
                test_crawler.close()
                self.assertEqual(len(test_result), 10)
Example #15
                                  default='pages',
                                  help='Directory for downloaded pages')
    arguments_parser.add_argument('-g', action='store_true', help='Show graph')
    arguments_parser.add_argument('-w',
                                  action='store_true',
                                  help='Save found pages')
    args = arguments_parser.parse_args()
    white_domains = []
    for domain in args.wildcard:
        if domain.startswith('*'):
            white_domains.append(re.compile(fr'[^.]+.{domain[1::]}'))
        else:
            white_domains.append(domain)
    if args.start_url[-1] == '/':
        url = args.start_url[:-1]
    else:
        url = args.start_url
    crawler = Crawler(url, args.request, white_domains, args.d, args.f, args.w)
    try:
        result = crawler.crawl()
        if args.g:
            show_graph(result)
        for link in result:
            print(link)
        print('Program is completed')
        plt.show()
    except KeyboardInterrupt:
        print('Program is completed')
    finally:
        crawler.close()
Example #16
import os
import sys
from crawler.crawler import Crawler
from crawler.database_writer import DatabaseWriter

# Set Python's recursion depth limit (default is 1000)
sys.setrecursionlimit(15499)

sites_to_crawl = "file://" + os.path.abspath("sites_to_crawl.html")

database_writer = DatabaseWriter('postgresql://localhost/eureka_development',
                                 10000)
crawler = Crawler(database_writer)
crawler.crawl(sites_to_crawl)
Example #17
from urllib.request import urlopen
from posts.repository import Posts
from crawler.parser import Parser
from crawler.fetcher import Fetcher
from crawler.crawler import Crawler
import sys

if __name__ == "__main__":
    # Parse arguments
    nb_requested_posts = int(sys.argv[1]) if len(sys.argv) > 1 else 10
    library_file = sys.argv[2] if len(sys.argv) > 2 else "posts.pdl"

    # Crawl
    crawler = Crawler(Fetcher(), Parser())
    posts = crawler.crawl(nb_requested_posts)

    # Persist crawled posts
    repository = Posts(library_file, True)
    for post in posts:
        repository.addPost(post)

    print("%s posts have been parsed and saved to %s" % (repository.getPostsCount(), library_file))
Example #18
class TestingCrawler(unittest.TestCase):
    def setUp(self):
        self.database_writer = MagicMock()
        self.database_reader = MagicMock()
        self.parser = MagicMock()
        self.database_reader.get_weburls_table_size = MagicMock(
            return_value=50)
        self.database_reader.get_weburls_and_content_table_size = MagicMock(
            return_value=10)
        self.database_reader.get_next_url = MagicMock(return_value=None)
        self.database_writer.database_limit = 10
        self.crawler = Crawler(self.database_writer, self.database_reader,
                               self.parser)
        self.local_index_html_file = "file://" + os.path.abspath(
            "test/website/index.html")
        self.crawler.crawl(self.local_index_html_file)

    def get_test_soup(self):
        test_soup = BeautifulSoup(
            '<!DOCTYPE html>\n<html>\n\n<head>\n <title>Cats and Dogs</title> \n<meta name="description" content="Page about cats and dogs"> \n <meta name="keywords" content="cats,dogs">\n</head><body><a href="www.dogs.com">Dogs</a><a href="www.cats.com">Cats</a></body></html>',
            'html.parser')
        return test_soup

    def test_crawler_is_instance_of_Crawler(self):
        self.assertIsInstance(self.crawler, Crawler)

    def test_crawl_calls_database_writer_write_url(self):
        self.database_writer.write_url = MagicMock()
        self.crawler.crawl(self.local_index_html_file)
        self.database_writer.write_url.assert_called_once_with(
            self.local_index_html_file)

    def test_crawl_accepts_and_assigns_url(self):
        self.assertEqual(self.crawler.url, self.local_index_html_file)

    def test_return_all_content_calls_database_writer_write_urls_and_content(
            self):
        self.crawler.database_writer.write_urls_and_content = MagicMock()
        self.crawler.return_all_content()
        self.crawler.database_writer.write_urls_and_content.assert_called_once(
        )

    def test_return_all_content_calls_crawl_next_url(self):
        self.crawler.crawl_next_url = MagicMock()
        self.crawler.return_all_content()
        self.crawler.crawl_next_url.assert_called_once()

    def test_return_all_content_calls_parser_create_soup_and_save_content(
            self):
        self.crawler.page = bytes()
        self.crawler.save_found_weburls = MagicMock()
        self.parser.create_soup_and_save_content = MagicMock()
        self.crawler.return_all_content()
        self.parser.create_soup_and_save_content.assert_called_once()

    def test_save_found_weburls_calls_database_writer_prepare_urls_for_writing_to_db(
            self):
        self.database_writer.prepare_urls_for_writing_to_db = MagicMock()
        self.crawler.save_found_weburls()
        test_urls_array = ["www.dogs.com", "www.cats.com"]
        self.database_writer.prepare_urls_for_writing_to_db.assert_called_once(
        )

    def test_crawl_next_url_calls_database_reader_get_next_url(self):
        self.crawler.crawl_next_url()
        self.database_reader.get_next_url.assert_called()
                      help="End page of searches results [2]")

    parser.add_option("-d",
                      "--download-searches",
                      dest="download_searches",
                      default="True",
                      help="Download search result pages (True/False) [True]")

    parser.add_option("-n",
                      "--download-offers",
                      dest="download_offers",
                      default="True",
                      help="Download offers (True/False) [True]")

    parser.add_option(
        "-r",
        "--remove-files",
        dest="remove_files",
        default="True",
        help="Empty dirs containing old files (True/False) [True]")

    opt, _ = parser.parse_args()

    crawler = Crawler(opt.city, opt.property_type)
    crawler.crawl(download_searches=opt.download_searches.lower() in "true",
                  download_offers=opt.download_offers.lower() in "true",
                  remove_files=opt.remove_files.lower() in "true",
                  start_page=opt.start_page,
                  end_page=opt.end_page,
                  rent=opt.offer_type)
Example #20
import os

from crawler.crawler import Crawler

url = os.getenv('URL')
output_dir = os.getenv('OUTPUT_DIR')

crawler = Crawler(url, output_dir)

crawler.crawl()
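
Since this entry point reads its parameters from environment variables, it would typically be launched with something like URL=https://example.com OUTPUT_DIR=./crawl_output python run.py, where the script name and both values are placeholders.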
Example #21
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-u",
        "--url",
        help="Start URL that will be the first step of crawling",
        type=str,
        required=True)
    parser.add_argument("-d",
                        "--max_depth",
                        help="Max depth of crawling",
                        type=int,
                        default=2,
                        required=False)
    parser.add_argument("-ureg",
                        "--url_regex",
                        help="URL regex of the URLs you want to crawl",
                        type=str,
                        default=None,
                        required=False)
    parser.add_argument("-ua",
                        "--user_agent",
                        help="Max depth of crawling",
                        type=str,
                        required=False)
    parser.add_argument(
        "-op",
        "--pickle_path",
        help=
        "The path of a pickle object that stores a dict where each key is a URL and the values "
        "are a list of URLs found there.",
        type=str,
        required=False)
    parser.add_argument(
        "-csv",
        "--csv_path",
        help="The path of a csv file that the URLs found in each page.",
        type=str,
        required=False)

    args = parser.parse_args()

    if not validators.url(args.url):
        raise ValueError(
            f"The first argument must be an URL. It is {args.url} instead")

    if args.max_depth < 1:
        raise ValueError(
            f"Max depth argument must be greater or equal than 1. It is {args.max_depth} instead"
        )

    url_filter: Optional[UrlFilter] = None
    if args.url_regex:
        try:
            url_filter = UrlFilter(args.url_regex)
        except re.error:
            raise ValueError(
                f"URL regex argument is not a valid regex. It is {args.url_regex} instead"
            )

    if not args.pickle_path and not args.csv_path:
        raise ValueError(f"An output file path (pickle or CSV) is required.")

    crawl_delay = (1, 2)

    logger = make_logger()

    crawler = Crawler(start_url=args.url,
                      stop_condition=StopCondition.depth_is_reached(
                          args.max_depth),
                      url_filters=[url_filter],
                      user_agent=UserAgent.none(),
                      timeout=None,
                      logger=logger,
                      delay=Delay.uniform(*crawl_delay))

    crawl: Crawl = asyncio.run(crawler.crawl())

    if args.pickle_path:
        PickleWriter(crawl).write(args.pickle_path)

    if args.csv_path:
        CSVWriter(crawl).write(args.csv_path)
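
A typical invocation of this CLI, with a placeholder script name and illustrative arguments: python crawl_cli.py -u https://example.com -d 2 -ureg ".*example.*" -csv links.csv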
Example #22
    elif test == "test_dom_family":
        dom_family("a.com", "b.a.com")
        dom_family("x.com", "x.com/login")
    elif test == "crawler":
        url = "http://forum.18.217.9.142.xip.io/login/"
        # url = "http://w.com"
        # url = "http://email.kumo.x10host.com"
        # url = "mizio.herokuapp.com/test"
        # url = "http://18.217.9.142.xip.io/"
        method = "bfs"
        agent = HTTP_UA_CHROME
        depth = 20
        pages = 100

        crawler = Crawler()
        crawler.crawl(url, method, agent, depth, pages)
    # configs
    elif test == "configs":
        config = configs.DEFAULT_CONFIGS
        for val in config:
            print("%s : %s" % (val, config[val]))

    # http_local
    elif test == "http_local":
        host = "localhost"
        port = 5000
        url = "/"
        ua = "chrome"

        # Test GET
        request = HttpRequest(host, port, "GET")