def main():
    _setup_logging()
    downloader = SeleniumHTMLDownloader(
        r'C:\Users\hananavr\Documents\לימודים\מעבדה בין תחומית\facebookCrawler copy\crawler\chromedriver.exe'
    )
    # mongodb://<dbuser>:<dbpassword>@ds215633.mlab.com:15633/pytheas
    store = MongoItemStore(host='ds215633.mlab.com',
                           port='15633',
                           db='pytheas',
                           article_collection='hanan',
                           username='******',
                           password='******')
    items_loader = MongoItemsLoader(host='ds215633.mlab.com',
                                    port='15633',
                                    db='pytheas',
                                    items_collection='hanan',
                                    username='******',
                                    password='******')
    crawler = Crawler(downloader, {
        'www.facebook.com/': FacebookParser(),
        'www.twitter.com/': TwitterParser()
    }, store, items_loader)
    crawler.crawl(FACEBOOK_PAGE_TO_DOWNLOAD_FROM)
def GET(): """ This method fetches the information needed to create the statistic of average comment karma by user from Reddit.com and returns it to the main application. :return: data needed to calculate the average comment karma by user """ logging.debug('GetKarmaStats method called') errors = [] try: chosen_username = web.input().chosen_username except Exception as e: errors.append('GetKarmaStats error:' + str(e)) logging.error('GetKarmaStats error:' + str(e)) my_crawler = Crawler(db) result = my_crawler.retrieve_user_avg_karma(chosen_username) del my_crawler if len(errors) == 0: return json.dumps({'result': result, 'code': 0}) else: return json.dumps({ 'result': result, 'code': len(errors), 'errors': list(errors) })
def test_init_positive02(self):
    my_crawler = Crawler()
    site_to_check = 'http://zabaykin.ru/?p=505'
    res = my_crawler.load_and_tokenize([site_to_check], depth=2)
    # print(res)
    # check number of parsed urls
    self.assertEqual(len(list(set(res.keys()))), 36)
def GET(): """ This method fetches the information needed to create the ranking of top users by submissions from Reddit.com and returns it to the main application. :return: data needed to create the Ranking of top users by comments score """ logging.debug('GetTopUsersCommentsScore method called') errors = [] try: chosen_subreddit = web.input().chosen_subreddit except Exception as e: errors.append('GetTopUsersCommentsScore error:' + str(e)) logging.error('GetTopUsersCommentsScore error:' + str(e)) my_crawler = Crawler(db) result = my_crawler.retrieve_total_user_comments_score( chosen_subreddit) del my_crawler if len(errors) == 0: return json.dumps({'result': result, 'code': 0}) else: return json.dumps({ 'result': result, 'code': len(errors), 'errors': list(errors) })
def GET(): """ This method fetches the information needed to create the list of submissions by user from Reddit.com and returns it to the main application. :return: data needed to create the list of submissions by user """ logging.debug('GetPostsByUser method called') errors = [] try: chosen_username = web.input().chosen_username except Exception as e: errors.append('GetPostsByUser error:' + str(e)) logging.error('GetPostsByUser error:' + str(e)) my_crawler = Crawler(db) result = my_crawler.retrieve_user_posts(chosen_username) del my_crawler if len(errors) == 0: return json.dumps({'result': result, 'code': 0}) else: return json.dumps({ 'result': result, 'code': len(errors), 'errors': list(errors) })
def setUp(self):
    self.mock_handler = Mock()
    self.mock_http = AsyncMock()
    self.mock_robots_agent = Mock()
    self.crawler = Crawler(
        self.mock_robots_agent, self.mock_http, self.mock_handler
    )
def _main(option, config):
    login_args = '{student} {password} {semester}'.format(**config['account'])
    cookie = loginceiba.info(*login_args.split())
    crawler = Crawler(cookie)
    uploader = Uploader(crawler, option.file)
    course_cache = os.path.join(os.path.dirname(__file__), 'courses.json')
    with open(course_cache, 'r') as courses_data:
        courses = json.load(courses_data)
    for course in map(parse.parse_course, courses):
        if course['作業區'] is None:
            continue
        get_links = lambda hw_url: [
            course['課程資訊']['課程網址'],  # If ignored: 操作錯誤 ("operation error")
            'https://ceiba.ntu.edu.tw/modules/hw/hw.php',  # If ignored: 您非修課學生 ("you are not enrolled in this course")
            'https://ceiba.ntu.edu.tw/modules/hw/' + hw_url
        ]
        name = course['課程資訊']['課程名稱']
        hws = [tuple(hw['名稱'].items())[0] for hw in course['作業區']]
        hws = [(hw[0], get_links(hw[1])) for hw in hws]
        uploader.add_page(UploadPage(name, hws))
    uploader.start()
    uploader.cleanup()
async def _main(site, concurrent, minimum_wait, maximum_wait, max_items):
    start_time = time.time()
    robot_agent = create_robot_agent(USER_AGENT, site)
    emitter = Emitter(max_items)
    handler = KolonialHandlers(site, emitter)
    http_client = HttpClient(USER_AGENT, minimum_wait, maximum_wait)
    crawler = Crawler(robot_agent, http_client, handler)

    queue = asyncio.Queue()
    queue.put_nowait("/")

    workers = [
        asyncio.create_task(worker_task(n, crawler, queue))
        for n in range(concurrent)
    ]

    await queue.join()

    for worker in workers:
        worker.cancel()
    await asyncio.gather(*workers, return_exceptions=True)

    await http_client.close()

    end_time = time.time()
    print(f"{http_client.gets} requests")
    print(f"{emitter.count} entries")
    print(f"took {end_time - start_time}s")
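# A minimal sketch of the worker_task coroutine referenced above (hypothetical;
# it assumes Crawler exposes an async crawl(path) coroutine that returns an
# iterable of newly discovered paths). Each worker pulls a path from the queue,
# crawls it, enqueues anything new, and marks the item done so queue.join()
# in _main can complete.
async def worker_task(n, crawler, queue):
    while True:
        path = await queue.get()
        try:
            for new_path in await crawler.crawl(path) or []:
                queue.put_nowait(new_path)
        finally:
            queue.task_done()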
def test_init_positive01(self):
    my_crawler = Crawler()
    site_to_check = 'http://zabaykin.ru'
    res = my_crawler.load_and_tokenize([site_to_check], depth=1)
    # print(res)
    # check number of paragraphs
    self.assertEqual(len(res[site_to_check]), 244)
def crawl(url, output_dir, depth=2, method="normal", gecko_path="geckodriver", page_name=None, custom_stats_handler=None, custom_process_handler=None): head_handlers = {} get_handlers = {} # get name of page for sub-directories etc. if not custom name given if page_name is None: page_name = urlparse(url).netloc get_handlers['application/pdf'] = LocalStoragePDFHandler( directory=output_dir, subdirectory=page_name) if custom_stats_handler is None: head_handlers['application/pdf'] = CSVStatsPDFHandler(directory=output_dir, name=page_name) else: for content_type, Handler in custom_stats_handler.items(): head_handlers[content_type] = Handler if custom_process_handler is None: process_handler = ProcessHandler() else: process_handler = custom_process_handler if not get_handlers and not head_handlers: raise ValueError('You did not specify any output') crawler = Crawler( downloader=requests_downloader, head_handlers=head_handlers, get_handlers=get_handlers, follow_foreign_hosts=False, crawl_method=method, gecko_path=gecko_path, process_handler=process_handler ) crawler.crawl(url, depth)
def crawl_start():
    if db.get_status() == 1:
        return util.response(1, 'crawler is running!')
    else:
        c = Crawler()
        t = threading.Thread(target=c.run)
        t.start()
        return util.response(0, 'started')
def main():
    c = Crawler()
    indices = c.get_master_indices()
    fs = FileSaver()
    fs.save_files(indices)
def main():
    _setup_logging()
    downloader = SeleniumHTMLDownloader('./lib/chromedriver.exe')
    store = MongoArticleStore("localhost", 27017, "Crawler", "Articles")
    crawler = Crawler(downloader, {
        'ynet.co.il': YnetParser()
    }, store)
    crawler.crawl('https://www.ynet.co.il')
def test_init():
    try:
        crawler = Crawler(get_logger())
        assert not crawler, "crawler is abstract and should not be allowed to be instantiated"
    except TypeError as type_err:
        assert type_err, "TypeError is expected; crawler is abstract"
    except Exception as err:
        assert not err, "Exception should be of type TypeError"
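# For context, the test above expects Crawler to be abstract, so instantiating
# it raises TypeError. A minimal sketch of such a base class (hypothetical;
# the method name and signature are assumptions, not the project's actual API):
import abc


class Crawler(abc.ABC):
    def __init__(self, logger):
        self.logger = logger

    @abc.abstractmethod
    def crawl(self, url):
        """Fetch and process the given URL."""


# Until a subclass implements crawl(), Crawler(get_logger()) raises TypeError,
# which is what test_init asserts.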
def main():
    with open('database.conf', 'r') as confFile:
        confStr = confFile.read()
    conf = json.JSONDecoder().decode(confStr)
    db.init_url(url=conf['mariadb_url'])
    crawler1 = Crawler(1000, url='https://www.szlcsc.com/brand.html')
    crawler2 = Crawler(1000, url='https://www.szlcsc.com/brand.html')
    # crawler.run(url='https://item.szlcsc.com/44085.html')
    # crawler.run(url='https://www.szlcsc.com/catalog.html')
    thread_one = threading.Thread(target=crawler1.run)
    thread_two = threading.Thread(target=crawler2.run)
    thread_one.start()
    thread_two.start()
    thread_one.join()
    thread_two.join()
    print('start')
def test_structure_positive_01(self):
    my_crawler = Crawler()
    site_to_check = 'http://zabaykin.ru/?p=505'
    res = my_crawler.load_and_tokenize([site_to_check], depth=2)
    self.assertIsInstance(res, OrderedDict)
    for k, el in res.items():
        self.assertIsInstance(k, str)
        self.assertTrue(k.startswith('http'))
        # isinstance() does not accept subscripted generics such as List[str],
        # so check the container and its elements separately
        self.assertIsInstance(el, list)
        for token in el:
            self.assertIsInstance(token, str)
def __init__(self):
    try:
        self.crawler = Crawler()
        self.pgraph = Graph()
        self.parser = Parser()
    except Exception as e:
        print("ERROR " + str(e))
        sys.exit(-1)
def test_find_all_urls(self):
    c = Crawler('http://www.hotline.ua')
    expected_urls = [
        'http://www.hotline.ua/help/hl_checkout/h1/',
        'http://www.hotline.ua/help/hl_checkout/',
        'http://www.hotline.ua/help/hl_checkout/h3/',
        'http://www.hotline.ua/help/hl_checkout/h2/'
    ]
    self.assertEqual(sorted(c.find_all_urls(self.page)),
                     sorted(expected_urls))
def main():
    parser = Parser(HTML.REDIS, PARSER.ELASTICSEARCH)

    # Crawler start
    crawler = Crawler(DB.MYSQL, HTML.REDIS, parser)
    try:
        crawler.run()
    except KeyboardInterrupt:
        crawler.stop()
        sys.exit(0)
def correct_document():
    while True:
        Session = sessionmaker(bind=DB_ENGINE)
        session = Session()
        keyword_manager = KeywordManager()
        search_keyword = keyword_manager.pull_candidate_keyword()
        print('Search keyword = ' + search_keyword)

        category = session.query(Category).filter_by(category_name=CONFIG.CATEGORY).first()
        knowledge = session.query(Knowledge).filter_by(knowledge_name=search_keyword).first()
        if knowledge is None:
            knowledge = Knowledge(category.id, search_keyword)
            session.add(knowledge)
            session.commit()
            knowledge = session.query(Knowledge).filter_by(knowledge_name=search_keyword).first()
        session.close()

        inside_document_corrector = InsideDocumentCollector(search_keyword)

        print('\ncollecting from crawled_knowledge_documents..')
        correct_document_count = inside_document_corrector.collect_document_from_crawled_knowledge_document(knowledge)
        print('finish. collected ' + str(correct_document_count))

        print('\ncollecting from without_knowledge_documents..')
        correct_document_count += inside_document_corrector.collect_document_from_without_knowledge_document(knowledge)
        print('finish. collected ' + str(correct_document_count))

        print('\ncollecting from web..')
        correct_document_count += Crawler().collect_document_from_web(search_keyword, knowledge)
        print('finish. collected ' + str(correct_document_count))

        if correct_document_count < 1:
            Session = sessionmaker(bind=DB_ENGINE)
            session = Session()
            session.delete(knowledge)
            session.commit()
            session.close()
            continue

        print('\ncreating knowledge association..')
        # establish knowledge associations
        pathfinder = Pathfinder()
        pathfinder.find_path(knowledge)
        print('done.\n')

        keyword_manager.delete_current_search_keyword(search_keyword)

        # re-enable the garbage collector
        gc.enable()
def main():
    configs = load_configs()
    conf_db = configs["database"]
    conf_param = configs["params"]

    crawler = Crawler()
    db = MongoDb(conf_db["address"], conf_db["port"])

    for stock_id in conf_param["stock_ids"]:
        data = crawler.fetch_stock_data(stock_id, conf_param["period"])
        db.append_one(data)
def launch(dir_for_docs, dir_checkpoints, checkpoints_name, description_file,
           lock, inv_index, frontier, documents, step_count):
    crawler = Crawler(frontier, dir_for_docs, dir_checkpoints, checkpoints_name,
                      lock, inv_index, description_file)
    if documents is None:
        open(crawler.file_description, 'w').close()  # Wipe file
    else:
        crawler.documents = documents
    if step_count is not None:
        crawler.steps_count = step_count
    crawler.run()
def stopCrawler(request):
    try:
        id = request.POST.get('id')
        source = Source.objects.get(id=id)
        sourceurl = source.url
        crawlerone = Crawler(sourceurl)
        crawlerone.stop()
        runingcrawlers.update({'id': id, 'inst': crawlerone})
        return redirect('dashboard')
    except ObjectDoesNotExist:
        return redirect('dashboard')
def test_fill_disallow_urls_from_robot(self):
    with patch.object(requests, 'get') as mock_get:
        with open('fake_robots.txt', 'r') as fake_robots_txt:
            mock_get.return_value = FakeResponse()
            mock_get.return_value.text = fake_robots_txt.read()
        test_crawler = Crawler('https://a/', [''], {})
        test_crawler.fill_disallow_urls(URL('https://a/'))
        test_crawler.close()
        self.assertEqual({re.compile('https://a/b.+', re.IGNORECASE)},
                         test_crawler.disallow_urls)
def crawl(url, cfgs):
    click.secho("Crawler will begin on '{url}' with the settings below:\n".format(url=url), fg='green')
    config = configs.load_config_section(config_section=cfgs)
    if config is None:
        print(f"Invalid config {cfgs}. Switching to DEFAULT.")
        config = configs.load_config_section(config_section='DEFAULT')
    else:
        print(f"Config set {cfgs} loaded.")
    click.echo()
    crawler = Crawler()
    print(f"Target URL = {url}")
    crawler.crawl(url, config['traversal'], config['user_agent'],
                  int(config['max_depth']), int(config['max_total']))
def test(url, vector_input, sensitive_input, random, speed):
    """Uses provided vectors and input to test against target."""
    # TODO(piper): pass files in to test.
    c = Crawler(url[0], auth=True)
    if vector_input:
        vectored = c.crawl([Gatherer()])
    if sensitive_input:
        for line in sensitive_input:
            print(line)
        # result = c.crawl([VectorGatherer()])
    print("Finished testing...")
def test_searcher_with_seen_urls(self):
    with patch.object(Crawler, 'get_html') as mock_get_html:
        mock_get_html.return_value = '<a href=http://scala-lang.org></a>' \
                                     '<a href=https://scala11.html></a>'
        with patch.object(Crawler, 'write_html') as mock_write_html:
            mock_write_html.return_value = None
            test_crawler = Crawler(
                'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                ['scala'], {}, 2)
            test_crawler.seen_urls.add(Page(URL('http://scala-lang.org')))
            test_result = test_crawler.crawl()
            test_crawler.close()
            assert 'http://scala-lang.org' not in test_result
def test_crawler_zero_result(self):
    with patch.object(Crawler, 'get_html') as mock_get_html:
        mock_get_html.return_value = '<a href=https://scala1.html></a>'
        with patch.object(Crawler, 'write_html') as mock_write_html:
            mock_write_html.return_value = None
            test_crawler = Crawler(
                'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                ['dog'], {}, 2)
            test_result = test_crawler.crawl()
            test_crawler.close()
            self.assertEqual(test_result, set())
def start(self):
    for game_type, crawl_range in self.crawl_range().items():
        crawler = Crawler(game_type)
        for date in pd.date_range(start=crawl_range["begin"], end=crawl_range["end"]):
            logging.debug(f"command: crawling {game_type} at {date}")
            crawler.run(self.format_date(date))
            # random sleep
            time.sleep(abs(np.random.normal(self.config["commander"]["queryPeriod"])))
def main():
    c = Crawler()
    fs = FileSaver()
    for i in range(EARLIEST, LATEST + 1):
        for j in range(Q1, Q4 + 1):
            print("\nStarting on year %s, quarter %s\n" % (str(i), str(j)))
            if i == 2016 and j == 4:
                pass
            else:
                f = open(BASE_MASTER_PATH.format(i, j), 'r')
                for line in f:
                    if contains_8k(line):
                        line = line.split('|')
                        doc = c.get_8k_form(line[URL])
                        wait = random.randint(1, 30)
                        print('Waiting %s seconds' % (str(wait)))
                        time.sleep(int(wait))
                        print('Current time is %s' % strftime('%Y-%m-%d %H:%M:%S', gmtime()))
                        if isinstance(line[DATE], str) and isinstance(line[COMPANY], str):
                            form_date = line[DATE]
                            string_array = line[COMPANY].split('/')
                            form_company = ""
                            for idx in range(len(string_array)):
                                form_company += string_array[idx]
                        else:
                            form_date = line[DATE].decode('utf-8', 'ignore')
                            form_company = line[COMPANY].decode('utf-8', 'ignore')
                        print('Found form from company %s on date of %s' % (form_company, form_date))
                        fs.save_8k_file(i, j, doc, form_date, form_company)