Exemple #1
0
    def process_item(self, item, spider):
        """Store the city record parsed from a ``PageItem`` and back up its page.

        For a ``PageItem`` the raw page bytes are decoded, passed through the
        extractor chain and handed to ``models.create_city_record``. Failures
        while parsing or saving are logged on ``spider.custom_logger`` rather
        than raised. The raw page is then written to
        ``<spider.res_dir>/<name>.html`` whether or not the record was stored.
        Items of any other type are passed through untouched.

        Args:
            item: The scraped item. A ``PageItem`` is read through its
                ``'name'`` and ``'page'`` keys (``'page'`` must be bytes, as
                it is decoded and written in binary mode).
            spider: The running spider; must expose ``custom_logger`` and
                ``res_dir``.

        Returns:
            The same ``item``, so that any later pipeline stage still
            receives it instead of ``None``.

        Raises:
            OSError: If the backup file cannot be written (not caught here).
        """
        logger = spider.custom_logger
        if not isinstance(item, PageItem):
            return item

        city_name = item['name']
        try:
            # Decode inside the guarded block so that an undecodable page is
            # logged and still backed up by the ``finally`` clause below.
            page_content = item['page'].decode()
            info_dict = utils.append_extra_fileds(
                extractors.process_parsed_dict(
                    extractors.parse_info_dict(
                        extractors.extract_info(page_content)
                    )
                )
            )
        except Exception as e:
            logger.error("Exception raised when parsing page of city '{city}': {e}".format(
                city=city_name,
                e=repr(e)
            ))
        else:
            # Parsing succeeded: save the record to the database.
            try:
                info_dict['city']['area_en'] = city_name

                create_status = models.create_city_record(info_dict)
                record_info = '{name} on {dtm}'.format(name=city_name, dtm=info_dict['update_dtm'])
                if create_status['success'] == 1:
                    logger.info('Successfully save a record: {}'.format(record_info))
                elif create_status['error_type'] == 'UniquenessError':
                    # A duplicate is expected when a page is crawled again, so
                    # it is reported as a warning rather than an error.
                    logger.warning('Ignore duplicate record: {}'.format(record_info))
                else:
                    logger.error('Fail to save record: {record} because of {err_type}: {err_msg}'.format(
                        record=record_info,
                        err_type=create_status['error_type'],
                        err_msg=create_status['info']
                    ))
            except Exception as e:
                logger.error("Exception raised when saving record of city '{city}': {e}".format(
                    city=city_name,
                    e=repr(e)
                ))
        finally:
            # Back up the raw page regardless of the outcome above.
            file_name = '{}.html'.format(city_name)
            with open(os.path.join(spider.res_dir, file_name), 'wb') as f:
                f.write(item['page'])
            logger.info("Successfully backup the page of city '{}'".format(city_name))

        return item
Exemple #2
0
    def process_item(self, item, spider):
        """Store the city record parsed from a ``PageItem`` and back up its page.

        For a ``PageItem`` the raw page bytes are decoded, passed through the
        extractor chain and handed to ``models.create_city_record``. Failures
        while parsing or saving are logged on ``spider.custom_logger`` rather
        than raised. The raw page is then written to
        ``<spider.res_dir>/<name>.html`` whether or not the record was stored.
        Items of any other type are passed through untouched.

        Args:
            item: The scraped item. A ``PageItem`` is read through its
                ``'name'`` and ``'page'`` keys (``'page'`` must be bytes, as
                it is decoded and written in binary mode).
            spider: The running spider; must expose ``custom_logger`` and
                ``res_dir``.

        Returns:
            The same ``item``, so that any later pipeline stage still
            receives it instead of ``None``.

        Raises:
            OSError: If the backup file cannot be written (not caught here).
        """
        logger = spider.custom_logger
        if not isinstance(item, PageItem):
            return item

        city_name = item['name']
        try:
            # Decode inside the guarded block so that an undecodable page is
            # logged and still backed up by the ``finally`` clause below.
            page_content = item['page'].decode()
            info_dict = utils.append_extra_fileds(
                extractors.process_parsed_dict(
                    extractors.parse_info_dict(
                        extractors.extract_info(page_content))))
        except Exception as e:
            logger.error(
                "Exception raised when parsing page of city '{city}': {e}".
                format(city=city_name, e=repr(e)))
        else:
            # Parsing succeeded: save the record to the database.
            try:
                info_dict['city']['area_en'] = city_name

                create_status = models.create_city_record(info_dict)
                record_info = '{name} on {dtm}'.format(
                    name=city_name, dtm=info_dict['update_dtm'])
                if create_status['success'] == 1:
                    logger.info('Successfully save a record: {}'.format(
                        record_info))
                elif create_status['error_type'] == 'UniquenessError':
                    # A duplicate is expected when a page is crawled again, so
                    # it is reported as a warning rather than an error.
                    logger.warning('Ignore duplicate record: {}'.format(
                        record_info))
                else:
                    logger.error(
                        'Fail to save record: {record} because of {err_type}: {err_msg}'
                        .format(record=record_info,
                                err_type=create_status['error_type'],
                                err_msg=create_status['info']))
            except Exception as e:
                logger.error(
                    "Exception raised when saving record of city '{city}': {e}"
                    .format(city=city_name, e=repr(e)))
        finally:
            # Back up the raw page regardless of the outcome above.
            file_name = '{}.html'.format(city_name)
            with open(os.path.join(spider.res_dir, file_name), 'wb') as f:
                f.write(item['page'])
            logger.info("Successfully backup the page of city '{}'".format(
                city_name))

        return item
    def handle(self, *args, **options):
        """Create city records from saved HTML pages and print a summary.

        Pages are read from ``options['pages_dir']``. When ``options['ssh']``
        is set, the directory is accessed through an SSH client and
        ``pages_dir`` may be written as ``[user@]host:path``; a missing host
        is prompted for and a missing user defaults to the current login name.

        Every page is run through the extractor chain and passed to
        ``create_city_record`` inside a single ``transaction.atomic()`` block.
        Records reported as failed (``result['success'] == 0``) are collected
        and listed at the end; they do not abort the run. Any exception raised
        while processing is reported on stderr and re-raised.

        Options read: ``pages_dir``, ``ssh``, ``E`` (SSH log file), ``port``,
        ``password``, ``city``.

        Raises:
            CommandError: If no hostname is available or the SSH connection
                cannot be established.
            Exception: Whatever is raised while reading or importing a page.
        """
        pages_dir = options['pages_dir']
        client = None

        if options['ssh']:
            # Use sftp to access file
            logging_file = options['E']
            if logging_file:
                logging_file = os.path.expanduser(logging_file)

            port = options['port']
            # get hostname
            username = ''
            if pages_dir.find(':') > 0:
                # Split on the first colon only: the remote path itself may
                # legitimately contain further colons.
                hostname, pages_dir = pages_dir.split(':', 1)
                if hostname.find('@') >= 0:
                    username, hostname = hostname.split('@')
            else:
                hostname = input('Hostname: ')
            if not hostname:
                raise CommandError('*** Hostname required.')

            # get username
            if username == '':
                username = getpass.getuser()

            # get password
            password = options['password']

            # connect
            try:
                client = get_ssh_client(hostname, username, password, port, log_file=logging_file)
            except ssh_exception.PasswordRequiredException:
                raise CommandError('Password is required.')
            except ssh_exception.AuthenticationException:
                raise CommandError('Authentication failed.')
            except ssh_exception.ServerNotKnown:
                raise CommandError('Unknown server.')
            except ssh_exception.ConnectionTimeOut:
                raise CommandError('Connection timed out.')

        # Collect any error info for each file
        errors = []
        # Collect update datetime for each successful creation classified by city
        success = {}
        # Pre-bind the loop variable: if the failure happens before the first
        # file is yielded, the error report below must not itself fail.
        file_name = None
        try:
            with transaction.atomic():
                for file_name, content in get_html_files_from_dir(pages_dir, client, city_names=options['city']):
                    info_dict = utils.append_extra_fileds(
                        extractors.process_parsed_dict(
                            extractors.parse_info_dict(
                                extractors.extract_info(content)
                            )
                        )
                    )
                    # The city's English name is taken from the file name.
                    name_en = re.search(r'([a-z]+)\.html', file_name).group(1)
                    info_dict['city']['area_en'] = name_en

                    rel_path = os.path.relpath(file_name, pages_dir)

                    result = create_city_record(info_dict)
                    if result['success'] == 0:
                        del result['success']
                        result['file'] = rel_path
                        result['info_dict'] = info_dict
                        errors.append(result)
                    else:
                        record = result['info']
                        success.setdefault(name_en, []).append(record.update_dtm)
        except Exception as e:
            # Fall back to the directory when no file had been reached yet.
            source = file_name if file_name is not None else pages_dir
            self.stderr.write('Exception raised from {}: {}'.format(source, repr(e)))
            raise

        error_num = len(errors)
        # Iterate over (city, datetimes) pairs; iterating the dict itself
        # would only yield the city names.
        success_num_by_city = {city: len(dtms) for city, dtms in success.items()}
        success_num = sum(success_num_by_city.values())

        self.stdout.write('Successfully collect {} city records.'.format(success_num))
        for city, num in success_num_by_city.items():
            self.stdout.write('{}: {}'.format(city, num))

        self.stdout.write('Fail to create {} city records.'.format(error_num))
        for error in errors:
            self.stdout.write('file: {}, type: {}, info: {}'.format(
                error['file'], error['error_type'], error['info']
            ))
            # Dump the parsed data for both error types that point at bad
            # field values.
            if error['error_type'] in {'ValidationError', 'ValueError'}:
                self.stdout.write('{}'.format(pprint.pformat(error['info_dict'])))