def test_fill_disallow_urls_from_robot(self):
    with patch.object(requests, 'get') as mock_get:
        with open('fake_robots.txt', 'r') as fake_robots_txt:
            mock_get.return_value = FakeResponse()
            mock_get.return_value.text = fake_robots_txt.read()
            test_crawler = Crawler('https://a/', [''], {})
            test_crawler.fill_disallow_urls(URL('https://a/'))
            test_crawler.close()
            self.assertEqual(
                {re.compile('https://a/b.+', re.IGNORECASE)},
                test_crawler.disallow_urls)
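# The test above relies on two fixtures defined elsewhere in the repository:
# fake_robots.txt and FakeResponse. A minimal sketch of what they are assumed
# to look like (not the repository's actual definitions): fake_robots.txt
# would contain a rule such as "Disallow: /b" for the tested host, and
# FakeResponse only needs the attributes the mocked requests.get caller reads.
class FakeResponse:
    """Hypothetical stand-in for requests.Response used by the tests."""

    def __init__(self, text='', status_code=200):
        self.text = text              # overwritten by the test with robots.txt content
        self.status_code = status_code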
def test_searcher_with_seen_urls(self):
    with patch.object(Crawler, 'get_html') as mock_get_html:
        mock_get_html.return_value = '<a href=http://scala-lang.org></a>' \
                                     '<a href=https://scala11.html></a>'
        with patch.object(Crawler, 'write_html') as mock_write_html:
            mock_write_html.return_value = None
            test_crawler = Crawler(
                'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                ['scala'], {}, 2)
            test_crawler.seen_urls.add(Page(URL('http://scala-lang.org')))
            test_result = test_crawler.crawl()
            test_crawler.close()
            assert 'http://scala-lang.org' not in test_result
def test_crawler_zero_result(self):
    with patch.object(Crawler, 'get_html') as mock_get_html:
        mock_get_html.return_value = '<a href=https://scala1.html></a>'
        with patch.object(Crawler, 'write_html') as mock_write_html:
            mock_write_html.return_value = None
            test_crawler = Crawler(
                'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                ['dog'], {}, 2)
            test_result = test_crawler.crawl()
            test_crawler.close()
            self.assertEqual(test_result, set())
def test_update_parents(self):
    with patch.object(Crawler, 'get_html') as mock_get_html:
        mock_get_html.return_value = '<a href=http://a/c/></a>' \
                                     '<a href=http://a/b/></a>'
        with patch.object(Crawler, 'write_html') as mock_write_html:
            mock_write_html.return_value = None
            test_crawler = Crawler('http://a', [''], {}, max_urls_count=3)
            test_result = test_crawler.crawl()
            test_crawler.close()
            for page in test_result:
                if page.parent:
                    self.assertEqual(page.parent, Page(URL('http://a')))
def crawler_sqlmap(entry_url, depth=-1, level=1, threads=2, timeout=30,
                   checkhost=True):
    """Entry point for launching a sqlmap scan.

    :param entry_url: entry URL of the site to scan
    :param depth: crawl depth for the web crawler; -1 means no depth limit
        (default -1)
    :param level: sqlmap test level, 1-5 (default 1); higher levels run more
        test payloads, giving more accurate results but taking longer
    :param threads: number of sqlmap scanning threads (default 2)
    :param timeout: sqlmap scan timeout in seconds (default 30)
    :param checkhost: whether to check that crawled links belong to the same
        domain
    :return: a 4-tuple (ret, url, simple, content)
        ret: True on success, False on failure
        url: the scan target URL
        simple: summary report (dict) built by extracting the key data from
            content
        content: the full report returned by sqlmap (dict)
        If ret is False, the error message is stored in the simple slot.
    """
    settings = Setting(handle=False)
    settings.depth = depth
    settings.nocheckhost = not checkhost
    settings.level = level
    settings.threads = threads
    settings.timeout = timeout
    sqlmap, crawler = None, None
    try:
        sqlmap, ip, port = start_sqlmap()
        # The crawler must be created after sqlmap has started so that it can
        # pick up the correct sqlmap port.
        crawler = Crawler(BASE_DIR, ip, port, entry_url, setting=settings)
        crawler.run()
        cont, simple = crawler.raw_report()
        return True, entry_url, simple, cont
    except Exception:
        logger.error(traceback.format_exc())
        return False, entry_url, traceback.format_exc(), {}
    finally:
        if crawler:
            crawler.close()
        if sqlmap:
            sqlmap.terminate()
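# Example usage of crawler_sqlmap -- a sketch only: the target URL is a
# hypothetical placeholder and the parameter values are illustrative, not
# recommendations from the source.
ok, url, simple, content = crawler_sqlmap(
    'http://target.example/index.php',  # assumed entry URL
    depth=2,         # crawl at most two levels of links
    level=2,         # slightly more thorough sqlmap test level
    threads=4,
    timeout=60,
    checkhost=True)  # only follow links on the same domain
if ok:
    print('scan finished for', url, '- summary:', simple)
else:
    # on failure the traceback text is returned in the "simple" slot
    print('scan failed for', url, ':', simple)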
def test_searcher_with_result(self):
    with patch.object(Crawler, 'get_html') as mock_get_html:
        mock_get_html.return_value = '<a href=https://scala1.html></a>' \
                                     '<a href=https://scala2.html></a>' \
                                     '<a href=https://scala3.html></a>' \
                                     '<a href=https://scala4.html></a>' \
                                     '<a href=https://scala5.html></a>' \
                                     '<a href=https://scala6.html></a>' \
                                     '<a href=https://scala7.html></a>' \
                                     '<a href=https://scala8.html></a>' \
                                     '<a href=https://scala9.html></a>' \
                                     '<a href=https://scala10.html></a>' \
                                     '<a href=https://scala11.html></a>'
        with patch.object(Crawler, 'write_html') as mock_write_html:
            mock_write_html.return_value = None
            test_crawler = Crawler(
                'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                ['scala'], {})
            test_result = test_crawler.crawl()
            test_crawler.close()
            self.assertEqual(len(test_result), 10)
    default='pages',
    help='Directory for downloaded pages')
arguments_parser.add_argument('-g', action='store_true', help='Show graph')
arguments_parser.add_argument('-w', action='store_true',
                              help='Save found pages')
args = arguments_parser.parse_args()

white_domains = []
for domain in args.wildcard:
    if domain.startswith('*'):
        white_domains.append(re.compile(fr'[^.]+.{domain[1:]}'))
    else:
        white_domains.append(domain)

if args.start_url[-1] == '/':
    url = args.start_url[:-1]
else:
    url = args.start_url

crawler = Crawler(url, args.request, white_domains, args.d, args.f, args.w)
try:
    result = crawler.crawl()
    if args.g:
        show_graph(result)
    for link in result:
        print(link)
    print('Program is completed')
    plt.show()
except KeyboardInterrupt:
    print('Program is completed')
finally:
    crawler.close()
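# An assumed command-line invocation of the script above (the script name and
# the exact names of the positional arguments and of the truncated -d/-f
# options are guesses based on how args is used, not confirmed by the source):
#   python main.py https://docs.scala-lang.org/ scala -d 2 -f pages -g -w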