Example #1
def download_imgs():
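    # NOTE: COMMON_FLAGS, mindate, maxdate, and url_folder are assumed to be defined or imported elsewhere in this module.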
    with open(os.path.join(COMMON_FLAGS.json_dir, 'selected_keywords.json'),
              'r') as fp:
        keywords = json.load(fp)
    print("keywords:", type(keywords), len(keywords), type(keywords[0]))

    api_keys = {
        'flickr': ('3845aa5608781b176e74bedd2a653b78', '19192eb5251a4809')
    }  # replace these with your own Flickr API key and secret
    # images_nbr = 10000  # number of images to fetch
    images_nbr = 200  # 200 images per keyword * 200 keywords = 40k

    ### Crawl and download images ###
    from web_crawler import WebCrawler
    crawler = WebCrawler(api_keys, mindate=mindate, maxdate=maxdate)

    # 1. Crawl the web and collect URLs:
    crawler.collect_links_from_web(keywords,
                                   images_nbr,
                                   remove_duplicated_links=True)

    # 2. (alternative to the previous line) Load URLs from a file instead of the web:
    #crawler.load_urls(download_folder + "/links.txt")
    #crawler.load_urls_from_json(download_folder + "/links.json")

    # 3. Save URLs to download them later (optional):
    # crawler.save_urls(os.path.join(download_folder, "links.txt"))
    crawler.save_urls_to_json(
        os.path.join(url_folder, "links-%s-%s.json" % (mindate, maxdate)))
Example #2
 def build(cls):
     command_print("Build started")
     cls.crawler = WebCrawler()
     # run method for index
     cls.crawler.scrape_index_pages()
     # run method for all country pages
     cls.crawler.scrape_country_pages()
     # run method for all continent pages
     cls.crawler.scrape_continent_pages()
     # create the index from memory
     cls.crawler.create_index_file()
     command_print("Build completed")
Example #3
 def test_crawl(self):
     """
     Tests the crawl method.
     The get_html method of the html_requester class is mocked to return the contents of html_test_data.html,
     which allows test HTML data to be supplied without having to host it online.
     """
     file_util = FileUtil()
     expected_result = file_util.get_file_contents("crawl_test_data.txt")
     web_crawler = WebCrawler()
     web_crawler.html_requester.get_html = lambda url: self.mock_get_html(
         url)
     actual_result = web_crawler.crawl("http://www.domain.com")
     self.assertEqual(expected_result, actual_result)
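The test relies on a mock_get_html helper that is not shown in this snippet. A minimal sketch of what it might look like, assuming it simply returns the local html_test_data.html fixture regardless of the requested URL:

 def mock_get_html(self, url):
     # Hypothetical stand-in for html_requester.get_html: ignore the URL and return the local fixture.
     file_util = FileUtil()
     return file_util.get_file_contents("html_test_data.html")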
Example #4
 def __init__(self, target):
     super().__init__()
     self.crawler = WebCrawler()
     target_config = getConfig().get("targets", {}).get(target)
     self.logger = getLogger(self.__class__.__name__)
     if not target_config:
         self.logger.error("target is not found in config.")
         raise Exception("target is not found in config.")
     self.logger.info(f"Application is processing target {target}")
     self.target_config = target_config
     self.max_threads = int(getConfig()["configs"]["max_threads"])
     self.sleep_time = int(self.target_config["sleep"])
     self.detail_urls = []
     self.items = []
Example #5
def main(args):
    logging.basicConfig(filename='web_crawler_' + str(datetime.now()) + '.log',
                        filemode='w',
                        format='%(name)s - %(levelname)s - %(message)s',
                        level=logging.DEBUG)
    try:
        web_crawler_instance = WebCrawler(args[1])
        web_crawler_instance.crawl()
    except IndexError as ex:
        logging.error(
            f"An error has occurred whilst running the crawler; no URL was provided: {str(ex)}"
        )
    except DBConnectionError as ex:
        logging.error(
            f"An error has occurred whilst connecting to the DB: {str(ex)}")
    except Exception as ex:
        logging.error(
            f"An error has occurred whilst running the crawler: {str(ex)}")
    logging.info("Program finished running.")
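DBConnectionError caught above is not a built-in exception; it is presumably defined in the crawler's own codebase. A minimal assumed definition would be:

class DBConnectionError(Exception):
    """Raised when the crawler cannot connect to the database (assumed definition, not from the source)."""
    pass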
Example #6
    def test_crawler(self):
        web_crawler = WebCrawler(url="http://localhost:5000",
                                 max_threads=32,
                                 max_pages=float("inf"))

        self.assertEqual(
            sorted(list(web_crawler.crawl())),
            sorted([
                "http://localhost:5000/com",
                "http://localhost:5000/test",
                "http://localhost:5000/test123",
            ]),
        )

        self.assertNotEqual(
            sorted(list(web_crawler.crawl())),
            sorted([
                "https://google.com", "http://localhost:5000/com", "/test",
                "/test123"
            ]),
        )
Example #7
def main():
    parser = ArgumentParser()
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-d", "--depth", type=int, help="limit crawling by depth of directory tree (default, 10)")
    group.add_argument("-c", "--count", type=int, help="limit crawling by number of pages")
    parser.add_argument("url_list", help="file containing urls separated by newlines")
    parser.add_argument("-v", "--verbose", action="store_true", help="set verbosity of program")
    parser.add_argument("-p", "--max-processes", type=int, help="maximum number of processes to run in parallel (default is 10)")
    parser.add_argument("-t", "--max-threads", type=int, help="maximum number of threads per process (default is 20)")
    args = parser.parse_args()

    # check if url_list file exists and that user has permission to read it
    if not os.path.isfile(args.url_list) or not os.access(args.url_list, os.R_OK):
        print("[-] File does not exist or is not readable: {}".format(args.url_list))
        sys.exit(1)

    # get url list
    urls = list()
    with open(args.url_list, "r") as url_list_file:
        for url in url_list_file:
            urls.append(url.strip())

    crawler = WebCrawler(urls)

    # set custom parameters
    if args.max_processes:
        crawler.max_processes = args.max_processes
    if args.max_threads:
        crawler.max_threads = args.max_threads
    if args.verbose:
        crawler.verbose = True
    if args.depth:
        crawler.limit = "depth"
        crawler.limit_param = args.depth
    elif args.count:
        crawler.limit = "count"
        crawler.limit_param = args.count

    crawler.start()
    sys.exit(0)
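A hypothetical invocation, assuming the script above is saved as crawl.py and urls.txt contains one URL per line:

python crawl.py urls.txt --depth 5 --verbose --max-processes 10 --max-threads 20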
Example #8
def web_crawler_main():
    """
        check user input and start WebCrawler
    """
    opts, args = get_args()
    logger = get_logger()

    url = add_valid_protocol_prefix(opts.url)
    depth_limit = opts.depth_limit if 0 < opts.depth_limit <= DEFAULT__DEPTH_LIMIT else None
    time_out = opts.time_out if 0 < opts.time_out else None

    if not url or not depth_limit or not time_out:
        if not url:
            logger.error("invalid page address")
        if not depth_limit:
            logger.error("invalid depth limit")
        if not time_out:
            logger.error("invalid time out")
        raise SystemExit(1)

    domain_name = get_sub_domain_name(url)
    web_crawler = WebCrawler(url, domain_name, depth_limit, time_out, logger)
    web_crawler.start()
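add_valid_protocol_prefix and get_sub_domain_name are project helpers that are not part of this snippet. Judging by their names and how they are used above, they might look roughly like this (an assumed sketch, not the original implementations):

from urllib.parse import urlparse

def add_valid_protocol_prefix(url):
    # Prepend http:// when the address has no scheme; return None for an empty address (assumed behavior).
    if not url:
        return None
    return url if url.startswith(("http://", "https://")) else "http://" + url

def get_sub_domain_name(url):
    # Extract the host part of the URL, e.g. "www.example.com" (assumed behavior).
    return urlparse(url).netloc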
Example #9
 def __init__(self):
     super().__init__()
     self.logger = getLogger(self.__class__.__name__)
     self.reviewUrl = getConfig()['task1']['review_url']
     self.web_crawler = WebCrawler()
Example #10
 def __init__(self):
     self.logger = getLogger(self.__class__.__name__)
     self.commentUrl = getConfig()['task2']['comment_url']
     self.web_crawler = WebCrawler()
     self.db_helper = DbHelper()
     self.__comments = []
Example #11
 def __init__(self):
     self.web_crawler = WebCrawler()
Example #12
 def __init__(self, base_url):
     super().__init__()
     self.logger = getLogger(self.__class__.__name__)
     self.crawler = WebCrawler()
     self.base_url = base_url
"""on importe notre module web crawler qu'on a créé
   """
import sys
from web_crawler import WebCrawler

DATA_TYPE = sys.argv[1]
CRAWLING_ACTIVATED = sys.argv[2] == '1'

# Input: a URL
if DATA_TYPE == '1':
    print("Starting crawler on URL")
    if not CRAWLING_ACTIVATED:
        print("(Crawling deactivated)")
    STARTING_URL = sys.argv[3]
    CRAWLER = WebCrawler()
    CRAWLER.crawl_site(STARTING_URL, CRAWLING_ACTIVATED)
    CRAWLER.print_report()

# Input: a local file (crawling disabled)
elif DATA_TYPE == '2':
    print("Starting crawler on local file")
    print("(Crawling deactivated)")
    CRAWLING_ACTIVATED = False
    LOCAL_FILE = sys.argv[3]
    CRAWLER = WebCrawler()
    CRAWLER.crawl_local_file(LOCAL_FILE)
    CRAWLER.print_report()

# Input: data from stdin
elif DATA_TYPE == '3':
    print("What type of std:in do you want to use \
Example #14
    )
    parser.add_argument(
        "--max_pages",
        type=int,
        help="limit the maximum number of pages that the crawler can parse")
    parser.add_argument("-v",
                        "--verbose",
                        help="increase the verbose output",
                        type=bool,
                        default=False)

    args = parser.parse_args()

    return {
        "url":
        args.url if args.url else "https://www.scrapehero.com/",
        "max_threads":
        args.max_threads if args.max_threads else min(
            32,
            os.cpu_count() + 4),  # is the max workers default value in python
        "max_pages":
        args.max_pages if args.max_pages else float("inf"),
        "verbosity":
        args.verbose,
    }


if __name__ == "__main__":
    args = initialise_arguments()
    web_crawler = WebCrawler(**args)
    pprint(web_crawler.crawl())
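The argument parser set-up at the top of initialise_arguments is cut off in this excerpt; the returned dict references args.url and args.max_threads, so the missing part presumably defines those flags, roughly as in this hypothetical sketch:

from argparse import ArgumentParser

def initialise_arguments():
    # Hypothetical reconstruction of the truncated beginning of this function.
    parser = ArgumentParser(description="web crawler")
    parser.add_argument("--url", help="starting URL for the crawl")
    parser.add_argument("--max_threads",
                        type=int,
                        help="maximum number of worker threads")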
Example #15
#!/usr/bin/env python

keywords = ["cats", "dogs", "birds"]
api_keys = {
    'google': ('XXXXXXXXXXXXXXXXXXXXXXXX', 'YYYYYYYYY'),
    'flickr': ('XXXXXXXXXXXXXXXXXXXXXXXX', 'YYYYYYYYY')
}
images_nbr = 10  # number of images to fetch
download_folder = "./data"  # folder in which the images will be stored

### Crawl and download images ###
from web_crawler import WebCrawler

crawler = WebCrawler(api_keys)

# 1. Crawl the web and collect URLs:
crawler.collect_links_from_web(keywords,
                               images_nbr,
                               remove_duplicated_links=True)

# 2. (alternative to the previous line) Load URLs from a file instead of the web:
#crawler.load_urls(download_folder + "/links.txt")
#crawler.load_urls_from_json(download_folder + "/links.json")

# 3. Save URLs to download them later (optional):
crawler.save_urls(download_folder + "/links.txt")
#crawler.save_urls_to_json(download_folder + "/links.json")

# 4. Download the images:
crawler.download_images(target_folder=download_folder)
Example #16
from web_crawler import WebCrawler
from worker import Worker

config = ConfigParser.RawConfigParser()
config.read('/etc/calfresh/calfresh.conf')

logger = logging.getLogger('root')

if __name__ == '__main__':

    logger.info('starting...')

    datapath = None
    for table in table_url_map.keys():
        try:
            crawler = WebCrawler(table, table_url_map[table])
            new_table_data = crawler.crawl()

            if new_table_data:
                worker = Worker(new_table_data)
                datapath = worker.work()

        except Exception as ex:
            logger.exception(ex)

    if datapath:
        loader = DataLoader()
        loader.load(datapath)

    crawler.clean_up()
    logger.info('finished')
Example #17
 def __init__(self):
     self._crawler = WebCrawler()
     self._parser = None
Example #18
 def setUp(self):
     self.crawler = WebCrawler(
         table='tbl_dfa256',
         url=table_url_map['tbl_dfa256'],
     )
Example #19
def main():
    crawl = WebCrawler(input())
    crawl.spider()
Example #20
 def __init__(self):
     self.inverted_index = InvertedIndex()
     self.crawler = WebCrawler()