def test_spider():
    """Smoke-test the Spider: dump the filter tree for one district/date
    range, then page through the matching content list, printing each item.
    """
    from session import Session
    from condition import Condition
    from spider import Spider
    from parameter import Parameter
    from datetime import datetime

    sess = Session()
    cond = Condition()
    spider = Spider(sess=sess)

    # page: items per page; order: sort key; direction: asc / desc
    tree_cond = cond.district('西藏自治区').date(datetime(1991, 1, 1),
                                            datetime(2018, 9, 15))
    tree_param = Parameter(param=str(tree_cond), sess=sess)
    print(spider.tree_content(param=tree_param))

    list_cond = cond.district('西藏自治区').date(datetime(1991, 1, 1),
                                            datetime(2018, 9, 15))
    list_param = Parameter(param=str(list_cond), sess=sess)
    for entry in spider.content_list(param=list_param,
                                     page=20,
                                     order='法院层级',
                                     direction='asc'):
        print(entry)
# Exemple #2  (scraped-site separator — commented out; not Python code)
# 0           (scraped-site vote count — commented out)
    def court(self,
              condition: Condition,
              district: str,
              start_court: str = None):
        """Yield court buckets for *district*, optionally resuming from
        *start_court*.

        :param condition: base query condition; narrowed to *district* here
        :param district: district name to restrict the query to
        :param start_court: start_court will only be available for level 2
        :return: Court name, court level, court indicator, count
        """
        # Per-level document counts. Keys are court levels:
        # 高级法院 = high court, 中级法院 = intermediate, 基层法院 = basic.
        level_count = {'高级法院': 0, '中级法院': 0, '基层法院': 0}
        condition = condition.district(district)
        info = self.tree_content(
            Parameter(param=str(condition), sess=self.sess))['法院层级']
        # satisfy stays True only if no entry of the level breakdown exceeds
        # 200 documents, i.e. each level can be fetched without drilling down.
        satisfy = True
        for item in info['ParamList']:
            if item['IntValue'] > 200:
                satisfy = False
            if item['Key'] in level_count:
                level_count[item['Key']] = item['IntValue']

        if satisfy:
            for k, v in level_count.items():
                if v > 0:
                    # NOTE(review): the second element here is the level *name*
                    # (a string), while the branches below yield numeric levels
                    # (1/2/3) — confirm downstream consumers accept both forms.
                    yield None, k, True, v

        else:
            # Resume support: if a start_court was given, skip intermediate
            # courts until it is encountered.
            start = start_court is None

            if start and level_count['高级法院'] > 0:
                yield None, 1, True, level_count['高级法院']
            middle = self.court_tree_content(condition,
                                             parval=district)['中级法院']
            # Process intermediate courts in ascending document-count order.
            for d in sorted(middle['ParamList'],
                            key=lambda item: item['IntValue'],
                            reverse=False):
                mid_court = d['Key']
                if not start:
                    if mid_court == start_court:
                        start = True
                if start:
                    if 0 < d['IntValue'] < 200:
                        # Small enough to fetch as one bucket; no drill-down.
                        yield mid_court, 2, False, d['IntValue']
                    else:
                        # Over the limit: expand into the basic courts under
                        # this intermediate court.
                        base = self.court_tree_content(
                            condition.court(mid_court, 2, False),
                            parval=mid_court)['基层法院']
                        if d['IntValue'] - base['IntValue'] > 0:
                            # Remainder = documents attributed directly to the
                            # intermediate court rather than any basic court.
                            yield mid_court, 2, True, d['IntValue'] - base[
                                'IntValue']
                        for g in sorted(base['ParamList'],
                                        key=lambda item: item['IntValue'],
                                        reverse=False):
                            base_court = g['Key']
                            if g['IntValue'] > 0:
                                yield base_court, 3, False, g['IntValue']
# Exemple #3  (scraped-site separator — commented out; not Python code)
# 0           (scraped-site vote count — commented out)
def crawl_by_district():
    """Crawl documents district by district, splitting by date interval and
    by court whenever a bucket exceeds the 200-document fetch limit.

    Resume state (district / date / court) is read from Config.start and is
    consumed exactly once — after the first use each marker is reset to None
    so later iterations start from the beginning. Each nesting level retries
    ErrorList failures up to Config.config.max_retry times before switching
    the session proxy. Results are appended to a timestamped file in ./data/.
    """
    # Read config
    start_dist, start_date, start_court = None, None, None
    start_info = Config.start
    if hasattr(start_info, 'district') and start_info.district is not None:
        start_dist = start_info.district
        logging.info('Start District: {}'.format(start_dist))
    if hasattr(start_info, 'date') and start_info.date is not None:
        start_date = start_info.date
        logging.info('Start Date: {}'.format(start_date.strftime("%Y-%m-%d")))
    if hasattr(start_info, 'court') and start_info.court is not None:
        start_court = start_info.court
        logging.info('Start Court: {}'.format(start_court))

    max_retry = Config.config.max_retry
    # Output file is opened in append mode for the lifetime of the crawl.
    data_file = open('./data/data {}.txt'.format(
        datetime.now().strftime('%Y-%m-%d %H-%M-%S')),
                     'a',
                     encoding='utf-8')

    s = Session()
    c = Condition()
    spider = Spider(sess=s)

    # Outermost retry loop: on ErrorList while listing districts, switch
    # proxy and restart the whole sweep.
    total_success = False
    while not total_success:
        try:
            if start_dist is not None:
                start = False
            else:
                start = True

            # log the distribution of district
            with open('district_list.txt', 'w', encoding='utf-8') as f:
                print(json.dumps(list(spider.district(condition=c)),
                                 ensure_ascii=False),
                      file=f)

            for dist in spider.district(condition=c):
                # Find the district to start
                if not start:
                    if dist == start_dist:
                        start = True
                    else:
                        continue
                logging.info(dist)
                c1 = c.district(dist)

                # If time_interval is interrupted, continue from the start_date
                cur_date = start_date
                start_date = None

                # Variables for retry
                dist_success = False
                dist_retry = max_retry
                while not dist_success:
                    try:
                        for time_interval in spider.time_interval(
                                condition=c1, start_date=cur_date):
                            logging.info('{0} {1} {2} {3}'.format(
                                dist, time_interval[0].strftime('%Y-%m-%d'),
                                time_interval[1].strftime('%Y-%m-%d'),
                                time_interval[2]))
                            # Remember interval start so a retry of this
                            # district resumes from the current interval.
                            cur_date = time_interval[0]
                            time_success = False
                            time_retry = max_retry
                            # index is the pagination cursor; it carries over
                            # across retries so fetching resumes mid-list.
                            index = 1
                            c2 = c1.date(time_interval[0], time_interval[1])

                            # Court resume marker is consumed once, like
                            # start_date above.
                            cur_court = start_court
                            start_court = None

                            while not time_success:
                                if time_interval[2] > 200:
                                    # Interval holds more than the 200-doc
                                    # fetch limit: split further by court.
                                    try:
                                        for court in spider.court(
                                                condition=c2,
                                                district=dist,
                                                start_court=cur_court):
                                            logging.info(
                                                '{0} {1} {2} {3} {4} {5} {6}'.
                                                format(
                                                    dist,
                                                    time_interval[0].strftime(
                                                        '%Y-%m-%d'),
                                                    time_interval[1].strftime(
                                                        '%Y-%m-%d'), court[0],
                                                    court[1], court[2],
                                                    court[3]))
                                            if court[1] == 2:
                                                # Track last level-2 court so a
                                                # retry can resume from it.
                                                cur_court = court[0]
                                            court_success = False
                                            court_retry = max_retry
                                            index = 1
                                            c3 = c2.court(*court[0:3])
                                            while not court_success:
                                                try:
                                                    for item, idx in spider.content_list(
                                                            param=Parameter(
                                                                param=str(c3),
                                                                sess=s),
                                                            page=20,
                                                            order='法院层级',
                                                            direction='asc',
                                                            index=index):
                                                        print(item,
                                                              file=data_file)
                                                        index = idx
                                                    court_success = True
                                                except ErrorList as e:
                                                    logging.error(
                                                        'Error when fetch content list: {0}'
                                                        .format(str(e)))
                                                    court_retry -= 1
                                                    if court_retry <= 0:
                                                        s.switch_proxy()
                                                        court_retry = max_retry
                                        time_success = True
                                    except ErrorList as e:
                                        logging.error(
                                            'Error when fetch court: {0}'.
                                            format(str(e)))
                                        time_retry -= 1
                                        if time_retry <= 0:
                                            s.switch_proxy()
                                            time_retry = max_retry
                                else:
                                    # Interval fits in one listing: fetch it
                                    # directly without splitting by court.
                                    try:
                                        for item, idx in spider.content_list(
                                                param=Parameter(param=str(c2),
                                                                sess=s),
                                                page=20,
                                                order='法院层级',
                                                direction='asc',
                                                index=index):
                                            print(item, file=data_file)
                                            index = idx
                                            # print(item['id'], item['name'])
                                            # try:
                                            #     spider.download_doc(item['id'])
                                            # except:
                                            #     print(item['id'], file=error_log)
                                        time_success = True
                                    except ErrorList as e:
                                        logging.error(
                                            'Error when fetch content list: {0}'
                                            .format(str(e)))
                                        time_retry -= 1
                                        if time_retry <= 0:
                                            s.switch_proxy()
                                            time_retry = max_retry
                        dist_success = True
                    except ErrorList as e:
                        logging.error(
                            'Error when fetch time interval: {0}'.format(
                                str(e)))
                        dist_retry -= 1
                        if dist_retry <= 0:
                            s.switch_proxy()
                            dist_retry = max_retry
            total_success = True
        except ErrorList as e:
            logging.error('Error when fetch dist information: {0}'.format(
                str(e)))
            s.switch_proxy()
    data_file.close()