def _crawl(crawler: Crawler, args: argparse.Namespace) -> int:
    """Crawl using the provided crawler.

    Args:
        crawler: The crawler object.
        args: The command line arguments.

    Returns:
        0 on success, else 1.
    """
    failure_occurred = False
    try:
        crawler.crawl()
        _print_dead_links(crawler.dead_links)
    except CrawlerException as exception:
        logger.error(str(exception))
        failure_occurred = True
    except Exception as exception:
        # Broad exception catch so every failure still produces a proper error message.
        failure_occurred = True
        logger.error("Error occurred while crawling")
        if args.show_exception_tb:  # Hide the traceback by default to keep the output clean.
            logger.exception(exception)
    return 1 if failure_occurred else 0
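# A minimal, hypothetical entry point sketching how _crawl could be wired up.
# The --show-exception-tb flag name and the bare Crawler() construction are
# assumptions; only args.show_exception_tb is implied by _crawl itself.
def main() -> int:
    parser = argparse.ArgumentParser(description="Crawl a site and report dead links.")
    parser.add_argument(
        "--show-exception-tb",
        action="store_true",
        help="print the full traceback on unexpected errors",
    )
    args = parser.parse_args()
    return _crawl(Crawler(), args)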
def start():
    crawler = Crawler()
    crawler_results = crawler.crawl()
    for crawler_result in crawler_results:
        attribute_string = crawler_result.get('attribute_string')
        attribute_usd_price = crawler_result.get('attribute_usd_price')
        attribute = crawler_result.get('attribute')
        converter = Converter()
        print(attribute, converter.convert(attribute_usd_price, attribute_string))
class LinkMiner:
    data = None

    def __init__(self, sources: list, targets: list):
        self.crawler = Crawler(sources=sources, targets=targets)
        self.graph = Digraph(strict=True, engine='circo')
        self.graph.graph_attr['overlap'] = 'false'

    def extract(self):
        self.data = self.crawler.run()
        nodes = Counter(self.data['nodes'])
        top = max(nodes.values())
        for node in nodes.keys():
            self.graph.node(
                node,
                node,
                **{
                    'size': str(max([nodes[node], int(top / 4)])),
                    'fontsize': str(max([nodes[node], int(top / 4)])),
                })
        for edge in self.data['edges']:
            self.graph.edge(edge['source'], edge['target'])

    def render(self, filename='untitled'):
        self.graph.render(f'{filename}.gv', view=True)

    def export_json(self, filename):
        string = json.dumps(self.data['edges'])
        with open(f'{filename}.json', 'w') as file:
            file.write(string)
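# A short usage sketch for LinkMiner, assuming `sources` and `targets` carry the
# URL lists the underlying Crawler expects; the example values are placeholders,
# not part of the original snippet.
miner = LinkMiner(sources=['https://example.com'], targets=['https://example.org'])
miner.extract()                  # crawl and build the weighted link graph
miner.render('link_graph')       # writes link_graph.gv and opens the rendered output
miner.export_json('link_edges')  # dumps the edge list to link_edges.json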
class ConverterTest(unittest.TestCase):

    def test_crawl(self):
        self.crawler = Crawler()
        self.assertIsInstance(self.crawler.crawl(), list)


if __name__ == '__main__':
    unittest.main()
def create_app_test():
    file_hndlrs = configure_logger()
    mediator_q = Queue()
    clients = [
        Parser(Queue(), mediator_q, config),
        BotProtocol(Queue(), mediator_q, config),
        Crawler(Queue(), mediator_q, config, 10),
        CommandMessageHandler(Queue(), mediator_q, config),
    ]
    mediator = AppMediator(mediator_q, clients)
    mediator.start()
    for client in clients:
        client.start()
    reglament_thread = Thread(target=reglament_work, args=[mediator])
    reglament_thread.start()
    try:
        while True:
            time.sleep(30)
            if not mediator.is_alive():
                logger.error('Mediator died, restarting...')
                mediator = AppMediator(mediator_q, mediator.clients)
                reglament_thread = Thread(target=reglament_work, args=[mediator])
                reglament_thread.start()
                mediator.start()
            mediator.check_clients()
    finally:
        for file_hndl in file_hndlrs:
            file_hndl.close()
def run_crawler():
    from src.const import NAME_URL_DICT
    from src.crawler import Crawler

    is_headless = args.headless
    for name in NAME_URL_DICT.keys():
        print(f"==={name}===")
        Crawler(name, is_headless).run()
def main(argv):
    domain = ''
    try:
        opts, args = getopt.getopt(argv, "hd:", ["domain="])
    except getopt.GetoptError:
        print('app.py -d <domain>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('app.py -d <domain>')
            sys.exit()
        elif opt in ("-d", "--domain"):
            domain = arg
    max_depth = 3
    site_map_set = Crawler(domain, max_depth).crawl(0)
    logger.info('site map:')
    for endpoint in site_map_set:
        logger.info(endpoint)
def __init__(self):
    # Parse arguments
    if len(sys.argv) > 1:
        arg = sys.argv[1]
    else:
        arg = ask('Enter domain:')
    if not arg:
        exit()

    # Verify domain integrity
    if '://' in arg:
        parsed = urlsplit(arg)
    else:
        parsed = urlsplit('http://' + arg)
    if '.' not in parsed.netloc:
        pr('Invalid domain!', '!')
        exit()

    # Verify subdomain
    self.subdomain = self.base_domain = None
    pts = parsed.netloc.split('.')
    if len(pts) > 2:
        pr('Is this the subdomain you wish to use? ' + pts[0])
        if pause('agree', cancel=True):
            # subdomain
            self.subdomain = pts[0]
            self.base_domain = '.'.join(pts[1:])
    if not self.subdomain:
        self.subdomain = 'www'
    if not self.base_domain:
        self.base_domain = parsed.netloc

    self.domain = parsed.netloc
    self.scheme = parsed.scheme if parsed.scheme else 'http'
    print()
    pr('Using domain: ' + fc + self.domain + fx)
    self.crawler = Crawler(self, parsed.path)
def extract_test(self):
    crawler = Crawler(self.test_url, 1)
    extracted_urls = crawler.extract_links(self.test_url)
    self.assertTrue(len(extracted_urls) > 0)
#!/usr/bin/env python
import logging

from src.crawler import Crawler

if '__main__' == __name__:
    logging.basicConfig(level=logging.DEBUG)
    Crawler().crawl()
import argparse

from src import settings
from src.api import BildungsserverFeed, LocalXmlFeed, LocalRssFeed
from src.crawler import Crawler, SiemensCrawler, BildungsserverCrawler
from src.exceptions import ConfigurationError

if __name__ == '__main__':
    if settings.CRAWLER.lower() == 'bildungsserver':
        Crawler = BildungsserverCrawler
    elif settings.CRAWLER.lower() == 'siemens-stiftung':
        Crawler = SiemensCrawler
    else:
        raise ConfigurationError("settings.CRAWLER must be set.")

    dry_run = settings.DRY_RUN
    crawler = Crawler(dry_run=dry_run)
    crawler.crawl()
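# A hypothetical settings module matching what the entry point above reads.
# Only the attribute names CRAWLER and DRY_RUN come from the snippet; the
# values shown here are placeholders.
# src/settings.py
CRAWLER = 'bildungsserver'  # or 'siemens-stiftung'
DRY_RUN = True              # crawl without persisting results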
import sys
import os

from termcolor import colored

from src.crawler import Crawler, CrawlerRequest

if len(sys.argv) < 2:
    print(colored('Please provide the input file as argument.', 'yellow'),
          colored('For eg:', 'yellow'))
    print(colored('$ python AMPCrawler.py /path/to/your/file.txt', 'green'))
    sys.exit(1)
elif not os.path.exists(sys.argv[1]) or not os.path.isfile(sys.argv[1]):
    print(colored('Make sure the given path exists', 'red'),
          colored('and points to a regular file', 'red'))
    sys.exit(1)

try:
    file = sys.argv[1]
    ampCrawl = Crawler(file)
    ampCrawl.run_crawler()
    ampCrawl.show_result()
    sys.exit(ampCrawl.exit_code())
except Exception as e:
    print("Crawler script failed with error:\n%s" % e)
    sys.exit(1)
import argparse

from src.crawler import Crawler
from src.feeder import Feeder
from warnings import filterwarnings

import pymysql

filterwarnings('ignore', category=pymysql.Warning)

parser = argparse.ArgumentParser(
    description="Download all Pokemon Showdown's stats files, and fill a database with its stats.")
parser.add_argument("dbms", help="Database Management System", choices=["mysql"])
parser.add_argument("host", help="Database address")
parser.add_argument("user", help="Database user")
parser.add_argument("password", help="User password")
parser.add_argument("dbname", help="Database name")
group = parser.add_mutually_exclusive_group()
group.add_argument("-p", "--only-parse", "--skip-download",
                   help="do not download any file from the internet and only use available local files to build the database",
                   action="store_true")
# group.add_argument("-d", "--only-download", "--skip-parse",
#                    help="do not parse and do not store any file in a database, and only download files from the internet",
#                    action="store_true")
parser.add_argument("-F", "--folder", help="folder to use to download files into, and to parse from")
parser.add_argument("-f", "--file", help="only process a single specific file")
parser.add_argument("-v", "--verbose", help="be verbose", action="store_true")
args = parser.parse_args()

# Phase 1: Download
print(args)
if not args.only_parse:
    crawler = Crawler('')
    crawler.run()

# Phase 2: Parse
feeder = Feeder('stats')
feeder.feedAll(args.dbms, args.host, args.user, args.password, args.dbname)
def main(browser, url, config):
    crawler = Crawler(browser, FreidaConfig(browser))
    crawler.scrape(url, 'freida_results_' + str(time.time())[:10] + '.csv')
def robots_test(self):
    crawler = Crawler(self.test_url, 1)
    self.assertTrue(self.robot_path and self.robot_url not in crawler.extract_links(self.test_url))
import time

from src.crawler import Crawler

TIME_LIMIT = 900  # Time in seconds, after which the crawler is forcibly stopped.

if __name__ == '__main__':
    print('Running crawler...')
    crawler = Crawler('airbnb')
    crawler.start()
    print('Thread started, Ctrl-C to stop early.')
    try:
        time.sleep(TIME_LIMIT)
    except KeyboardInterrupt:
        print("** Killing crawler")
    else:
        print("** Time's up, ending crawl.")
    finally:
        crawler.kill()
        crawler.wait_for_child()
        print('** Crawler finished')
from urllib.parse import urlparse

from src import std
from src import scanner
from src import reverseip
from src import serverinfo
from src.web import search
from src.crawler import Crawler

# search engine instances
bing = search.Bing()
google = search.Google()
yahoo = search.Yahoo()

# crawler instance
crawler = Crawler()


def singlescan(url):
    """Scan a single targeted domain."""
    if urlparse(url).query != '':
        result = scanner.scan([url])
        if result != []:
            # scanner.scan prints if vulnerable, therefore exit
            return result
        else:
            print("")  # move carriage return to newline
            std.stdout("no SQL injection vulnerability found")
def scrape_yard():
    crawler = AsynchronousCrawler() if settings.isASynchronous else Crawler()
    crawler.start()

    scraper = MultiprocessScraper() if settings.isMultiprocess else Scraper()
    scraper.start()