Python Spider.content_list Beispiele

Programmiersprache: Python

Namespace / Paketname: spider

Klasse / Typ: Spider

Methode / Funktion: content_list

Beispiele auf hotexamples.com: 2

Python Spider.content_list – 2 Beispiele gefunden. Dies sind die am besten bewerteten Python-Beispiele für die Methode spider.Spider.content_list, die aus Open-Source-Projekten extrahiert wurden. Sie können die Beispiele bewerten, um deren Qualität zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

Spider(30)

crawl_page(30)

crawl(14)

__init__(8)

craw(4)

Search(4)

crawl_genre(3)

build_node(3)

analyse(3)

process_page(2)

court(2)

add_url(2)

content_list(2)

GetInfo(2)

crowl(1)

crowl_page(1)

GET(1)

crawled_page(1)

createResultExcel(1)

get2l_url(1)

crawledPage(1)

crawle_page_in_queue(1)

crawl_weather(1)

crawl_video_urls(1)

crawl_robots(1)

data(1)

getfilename(1)

get3l_url(1)

post(1)

update(1)

startCrawl(1)

setworkdir(1)

setfilename(1)

setDaemon(1)

responseCallback(1)

parse_blog(1)

getSoup(1)

linkCallback(1)

levelCallback(1)

is_valid(1)

is_outgoing(1)

htmlCallback(1)

get_pdfs(1)

crawl_page_graph(1)

crawl_async_slots(1)

crawl_next_page_from_queue(1)

authorized(1)

Process(1)

ReturnValues(1)

Text(1)

Beispiel #1

Datei anzeigen

Datei: test_unit.py Projekt: foreverhui/wenshu_spider

def test_spider():
    """Exercise ``Spider.tree_content`` and ``Spider.content_list`` by hand.

    Builds a query for the district '西藏自治区' restricted to the date range
    1991-01-01 .. 2018-09-15, prints the result of ``tree_content`` for it,
    then prints every entry yielded by ``content_list`` for the same query.
    Nothing is asserted; the output is meant to be inspected manually.
    """
    from datetime import datetime

    from session import Session
    from condition import Condition
    from spider import Spider
    from parameter import Parameter

    session = Session()
    condition = Condition()
    crawler = Spider(sess=session)

    def make_param():
        # A fresh Parameter is built for every request, exactly as the two
        # calls below each need their own instance.
        window = condition.district('西藏自治区').date(
            datetime(1991, 1, 1), datetime(2018, 9, 15))
        return Parameter(param=str(window), sess=session)

    tree = crawler.tree_content(param=make_param())
    print(tree)

    # page: number of entries per page; order: sort criterion;
    # direction: sort order ('asc' - ascending, 'desc' - descending)
    listing = crawler.content_list(
        param=make_param(),
        page=20,
        order='法院层级',
        direction='asc',
    )
    for entry in listing:
        print(entry)

Beispiel #2

Datei anzeigen

def _dump_content_list(spider, sess, condition, data_file, max_retry):
    """Write every content-list item for *condition* to *data_file*.

    Retries until the listing has been consumed completely. ``index`` starts
    at 1 and is advanced to the value yielded alongside each item, so a retry
    passes the last seen index back to ``content_list`` instead of 1.

    Args:
        spider: Object providing ``content_list``.
        sess: Session handed to ``Parameter``; ``switch_proxy()`` is called
            on it every time the retry budget is used up.
        condition: Query condition; ``str(condition)`` is the request param.
        data_file: Writable text file receiving one printed item per line.
        max_retry: Number of ``ErrorList`` failures tolerated between two
            proxy switches.
    """
    index = 1
    retries_left = max_retry
    while True:
        try:
            for item, idx in spider.content_list(
                    param=Parameter(param=str(condition), sess=sess),
                    page=20,
                    order='法院层级',
                    direction='asc',
                    index=index):
                print(item, file=data_file)
                index = idx
            return
        except ErrorList as e:
            logging.error(
                'Error when fetch content list: {0}'.format(str(e)))
            retries_left -= 1
            if retries_left <= 0:
                sess.switch_proxy()
                retries_left = max_retry


def _crawl_courts(spider, sess, condition, dist, time_interval, start_court,
                  data_file, max_retry):
    """Crawl one time interval court by court, dumping each court's items.

    Retries the court enumeration until it finishes without ``ErrorList``.
    The court to resume from is updated whenever an entry with
    ``court[1] == 2`` is seen, so a retry passes that court as
    ``start_court`` instead of the original one.

    Args:
        spider: Object providing ``court`` and ``content_list``.
        sess: Session used for requests and proxy switching.
        condition: Condition already restricted to district and date range.
        dist: District value, forwarded to ``spider.court`` and logged.
        time_interval: Interval record; elements 0 and 1 are logged via
            ``strftime``.
        start_court: Court to resume from, or ``None``.
        data_file: Writable text file receiving the items.
        max_retry: Number of ``ErrorList`` failures tolerated between two
            proxy switches.
    """
    cur_court = start_court
    retries_left = max_retry
    while True:
        try:
            for court in spider.court(condition=condition,
                                      district=dist,
                                      start_court=cur_court):
                logging.info('{0} {1} {2} {3} {4} {5} {6}'.format(
                    dist, time_interval[0].strftime('%Y-%m-%d'),
                    time_interval[1].strftime('%Y-%m-%d'), court[0],
                    court[1], court[2], court[3]))
                # presumably court[1] is the court level and 2 marks the
                # level at which resuming is possible - TODO confirm
                if court[1] == 2:
                    cur_court = court[0]
                _dump_content_list(spider, sess,
                                   condition.court(*court[0:3]), data_file,
                                   max_retry)
            return
        except ErrorList as e:
            logging.error('Error when fetch court: {0}'.format(str(e)))
            retries_left -= 1
            if retries_left <= 0:
                sess.switch_proxy()
                retries_left = max_retry


def crawl_by_district():
    """Crawl all content lists district by district into a data file.

    Reads an optional resume point (district, date, court) from
    ``Config.start`` and the retry budget from ``Config.config.max_retry``.
    Items are appended to ``./data/data <timestamp>.txt``; the list of
    districts is additionally written to ``district_list.txt`` as JSON.

    For every district the time intervals are enumerated; an interval whose
    third element exceeds 200 is crawled court by court, any other interval
    is fetched in one listing. Every level retries on ``ErrorList`` and
    switches the session's proxy when its retry budget is used up. Any other
    exception propagates; the data file is closed in either case.
    """
    # Read config
    start_info = Config.start
    start_dist = getattr(start_info, 'district', None)
    if start_dist is not None:
        logging.info('Start District: {}'.format(start_dist))
    start_date = getattr(start_info, 'date', None)
    if start_date is not None:
        logging.info('Start Date: {}'.format(start_date.strftime("%Y-%m-%d")))
    start_court = getattr(start_info, 'court', None)
    if start_court is not None:
        logging.info('Start Court: {}'.format(start_court))

    max_retry = Config.config.max_retry
    data_path = './data/data {}.txt'.format(
        datetime.now().strftime('%Y-%m-%d %H-%M-%S'))

    # The context manager guarantees the file is closed even when an
    # exception other than ErrorList aborts the crawl.
    with open(data_path, 'a', encoding='utf-8') as data_file:
        s = Session()
        c = Condition()
        spider = Spider(sess=s)

        # NOTE(review): start_date and start_court are consumed on first use,
        # so a restart of this outer loop resumes at start_dist without them
        # - confirm this is intended.
        total_success = False
        while not total_success:
            try:
                # Districts are skipped until start_dist has been reached.
                start = start_dist is None

                # log the distribution of district
                with open('district_list.txt', 'w', encoding='utf-8') as f:
                    print(json.dumps(list(spider.district(condition=c)),
                                     ensure_ascii=False),
                          file=f)

                for dist in spider.district(condition=c):
                    # Find the district to start
                    if not start:
                        if dist != start_dist:
                            continue
                        start = True
                    logging.info(dist)
                    c1 = c.district(dist)

                    # If time_interval is interrupted, continue from the
                    # start_date
                    cur_date = start_date
                    start_date = None

                    # Variables for retry
                    dist_success = False
                    dist_retry = max_retry
                    while not dist_success:
                        try:
                            for time_interval in spider.time_interval(
                                    condition=c1, start_date=cur_date):
                                logging.info('{0} {1} {2} {3}'.format(
                                    dist,
                                    time_interval[0].strftime('%Y-%m-%d'),
                                    time_interval[1].strftime('%Y-%m-%d'),
                                    time_interval[2]))
                                # Remember where to resume if this district
                                # has to be retried.
                                cur_date = time_interval[0]
                                c2 = c1.date(time_interval[0],
                                             time_interval[1])

                                cur_court = start_court
                                start_court = None

                                # presumably time_interval[2] is the number
                                # of results and 200 the most one listing
                                # can return - TODO confirm
                                if time_interval[2] > 200:
                                    _crawl_courts(spider, s, c2, dist,
                                                  time_interval, cur_court,
                                                  data_file, max_retry)
                                else:
                                    _dump_content_list(spider, s, c2,
                                                       data_file, max_retry)
                            dist_success = True
                        except ErrorList as e:
                            logging.error(
                                'Error when fetch time interval: {0}'.format(
                                    str(e)))
                            dist_retry -= 1
                            if dist_retry <= 0:
                                s.switch_proxy()
                                dist_retry = max_retry
                total_success = True
            except ErrorList as e:
                logging.error('Error when fetch dist information: {0}'.format(
                    str(e)))
                s.switch_proxy()