def process_item(self, item, spider):
    logger = spider.custom_logger
    if isinstance(item, PageItem):
        city_name = item['name']
        page_content = item['page'].decode()
        # Save the record to the database first.
        try:
            info_dict = utils.append_extra_fileds(
                extractors.process_parsed_dict(
                    extractors.parse_info_dict(
                        extractors.extract_info(page_content)
                    )
                )
            )
            try:
                info_dict['city']['area_en'] = city_name
                create_status = models.create_city_record(info_dict)
                record_info = '{name} on {dtm}'.format(
                    name=city_name, dtm=info_dict['update_dtm'])
                if create_status['success'] == 1:
                    logger.info('Successfully saved a record: {}'.format(record_info))
                else:
                    err_type = create_status['error_type']
                    if err_type == 'UniquenessError':
                        logger.warning('Ignored duplicate record: {}'.format(record_info))
                    else:
                        logger.error(
                            'Failed to save record: {record} because of {err_type}: {err_msg}'.format(
                                record=record_info,
                                err_type=err_type,
                                err_msg=create_status['info'],
                            ))
            except Exception as e:
                logger.error("Exception raised when saving record of city '{city}': {e}".format(
                    city=city_name, e=repr(e)))
        except Exception as e:
            logger.error("Exception raised when parsing page of city '{city}': {e}".format(
                city=city_name, e=repr(e)))
        finally:
            # Back up the raw page whether or not parsing succeeded.
            file_name = '{}.html'.format(city_name)
            with open(os.path.join(spider.res_dir, file_name), 'wb') as f:
                f.write(item['page'])
            logger.info("Successfully backed up the page of city '{}'".format(city_name))
    # Return the item so any later pipelines still receive it.
    return item
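# For the process_item pipeline above to take effect, Scrapy has to be told
# about it in the project settings. A minimal sketch, assuming the pipeline
# class is called SaveRecordPipeline in a module aqi.pipelines (both names
# are hypothetical, not taken from this project):

ITEM_PIPELINES = {
    # Priority in the 0-1000 range; lower-valued pipelines run first.
    'aqi.pipelines.SaveRecordPipeline': 300,
}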
def handle(self, *args, **options):
    pages_dir = options['pages_dir']
    client = None
    if options['ssh']:
        # Use SFTP to access the files on a remote host.
        logging_file = options['E']
        if logging_file:
            logging_file = os.path.expanduser(logging_file)
        port = options['port']
        # Get the hostname (and optional username) from a 'user@host:path' spec.
        username = ''
        if pages_dir.find(':') > 0:
            hostname, pages_dir = pages_dir.split(':')
            if hostname.find('@') >= 0:
                username, hostname = hostname.split('@')
        else:
            hostname = input('Hostname: ')
            if len(hostname) == 0:
                raise CommandError('*** Hostname required.')
        # Fall back to the local user when no username was given.
        if username == '':
            username = getpass.getuser()
        # Get the password.
        password = options['password']
        # Connect.
        try:
            client = get_ssh_client(hostname, username, password, port,
                                    log_file=logging_file)
        except ssh_exception.PasswordRequiredException:
            raise CommandError('Password is required.')
        except ssh_exception.AuthenticationException:
            raise CommandError('Authentication failed.')
        except ssh_exception.ServerNotKnown:
            raise CommandError('Unknown server.')
        except ssh_exception.ConnectionTimeOut:
            raise CommandError('Connection timed out.')

    # Collect any error info for each file.
    errors = []
    # Collect update datetimes for each successful creation, keyed by city.
    success = {}
    try:
        with transaction.atomic():
            for file_name, content in get_html_files_from_dir(
                    pages_dir, client, city_names=options['city']):
                info_dict = utils.append_extra_fileds(
                    extractors.process_parsed_dict(
                        extractors.parse_info_dict(
                            extractors.extract_info(content)
                        )
                    )
                )
                name_en = re.search(r'([a-z]+)\.html', file_name).group(1)
                info_dict['city']['area_en'] = name_en
                rel_path = os.path.relpath(file_name, pages_dir)
                result = create_city_record(info_dict)
                if result['success'] == 0:
                    del result['success']
                    result['file'] = rel_path
                    result['info_dict'] = info_dict
                    errors.append(result)
                else:
                    if name_en not in success:
                        success[name_en] = []
                    record = result['info']
                    success[name_en].append(record.update_dtm)
    except Exception as e:
        self.stderr.write('Exception raised from {}: {}'.format(file_name, repr(e)))
        raise

    error_num = len(errors)
    # Count successes per city from the collected update datetimes.
    success_num_by_city = {city: len(dtms) for city, dtms in success.items()}
    success_num = sum(success_num_by_city.values())
    self.stdout.write('Successfully collected {} city records.'.format(success_num))
    for city, num in success_num_by_city.items():
        self.stdout.write('{}: {}'.format(city, num))
    self.stdout.write('Failed to create {} city records.'.format(error_num))
    for error in errors:
        self.stdout.write('file: {}, type: {}, info: {}'.format(
            error['file'], error['error_type'], error['info']))
        # Dump the parsed dict for data errors to ease debugging.
        if error['error_type'] in {'ValidationError', 'ValueError'}:
            self.stdout.write('{}'.format(pprint.pformat(error['info_dict'])))
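# Both process_item and handle above lean on the same return contract from
# create_city_record: expected failures do not raise, the outcome comes back
# in a dict. A minimal sketch of that contract, inferred from the two call
# sites (CityRecord and the exception-name mapping are assumptions, not the
# project's actual code):

def create_city_record(info_dict):
    try:
        # CityRecord is a hypothetical Django model standing in for whatever
        # the real function persists; the 'city' sub-dict comes from usage above.
        record = CityRecord.objects.create(**info_dict['city'])
        return {'success': 1, 'info': record}  # record exposes .update_dtm
    except Exception as e:
        # Callers branch on names like 'UniquenessError' and 'ValidationError';
        # deriving error_type from the exception class name is an assumption.
        return {'success': 0, 'error_type': type(e).__name__, 'info': str(e)}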