def process_item(self, item, spider):
    """Persist a crawled city page and always back up the raw HTML.

    For a ``PageItem``: parse the page into an info dict, save a city
    record through ``models.create_city_record``, and — whatever happens
    during parsing/saving — write the raw page bytes to
    ``spider.res_dir/<city>.html``.

    Returns the item unchanged so later pipeline stages receive it
    (a Scrapy pipeline that returns ``None`` silently drops the item).
    """
    logger = spider.custom_logger
    if isinstance(item, PageItem):
        city_name = item['name']
        page_content = item['page'].decode()
        # save record to database first
        try:
            # NOTE(review): "fileds" looks like a typo for "fields", but it is
            # the helper's actual name — renaming must happen in `utils` first.
            info_dict = utils.append_extra_fileds(
                extractors.process_parsed_dict(
                    extractors.parse_info_dict(
                        extractors.extract_info(page_content))))
            try:
                info_dict['city']['area_en'] = city_name
                create_status = models.create_city_record(info_dict)
                record_info = '{name} on {dtm}'.format(
                    name=city_name, dtm=info_dict['update_dtm'])
                if create_status['success'] == 1:
                    logger.info('Successfully save a record: {}'.format(record_info))
                else:
                    err_type = create_status['error_type']
                    if err_type == 'UniquenessError':
                        # FIX: Logger.warn() is a deprecated alias of warning()
                        logger.warning('Ignore duplicate record: {}'.format(record_info))
                    else:
                        logger.error('Fail to save record: {record} because of {err_type}: {err_msg}'.format(
                            record=record_info,
                            err_type=err_type,
                            err_msg=create_status['info']
                        ))
            except Exception as e:
                logger.error("Exception raised when saving record of city '{city}': {e}".format(
                    city=city_name, e=repr(e)
                ))
        except Exception as e:
            logger.error("Exception raised when parsing page of city '{city}': {e}".format(
                city=city_name, e=repr(e)
            ))
        finally:
            # backup file — runs even when parsing or saving failed, so the
            # raw page is never lost
            file_name = '{}.html'.format(city_name)
            with open(os.path.join(spider.res_dir, file_name), 'wb') as f:
                f.write(item['page'])
            logger.info("Successfully backup the page of city '{}'".format(city_name))
    # FIX: a pipeline must return the item so subsequent pipelines get it
    return item
def process_item(self, item, spider):
    """Handle one crawled item from the spider.

    ``PageItem`` instances are parsed and stored as a city record; the
    raw HTML is always backed up to ``spider.res_dir/<city>.html``, even
    when parsing or saving raises. The item is returned so the next
    pipeline stage receives it (returning ``None`` would drop it).
    """
    logger = spider.custom_logger
    if isinstance(item, PageItem):
        city_name = item['name']
        page_content = item['page'].decode()
        # save record to database first
        try:
            # NOTE(review): "fileds" is the helper's actual (misspelled) name.
            info_dict = utils.append_extra_fileds(
                extractors.process_parsed_dict(
                    extractors.parse_info_dict(
                        extractors.extract_info(page_content))))
            try:
                info_dict['city']['area_en'] = city_name
                create_status = models.create_city_record(info_dict)
                record_info = '{name} on {dtm}'.format(
                    name=city_name, dtm=info_dict['update_dtm'])
                if create_status['success'] == 1:
                    logger.info('Successfully save a record: {}'.format(
                        record_info))
                else:
                    err_type = create_status['error_type']
                    if err_type == 'UniquenessError':
                        # FIX: warn() is a deprecated alias — use warning()
                        logger.warning('Ignore duplicate record: {}'.format(
                            record_info))
                    else:
                        logger.error(
                            'Fail to save record: {record} because of {err_type}: {err_msg}'
                            .format(record=record_info,
                                    err_type=err_type,
                                    err_msg=create_status['info']))
            except Exception as e:
                logger.error(
                    "Exception raised when saving record of city '{city}': {e}"
                    .format(city=city_name, e=repr(e)))
        except Exception as e:
            logger.error(
                "Exception raised when parsing page of city '{city}': {e}".
                format(city=city_name, e=repr(e)))
        finally:
            # backup file regardless of the outcome above
            file_name = '{}.html'.format(city_name)
            with open(os.path.join(spider.res_dir, file_name), 'wb') as f:
                f.write(item['page'])
            logger.info("Successfully backup the page of city '{}'".format(
                city_name))
    # FIX: return the item so subsequent pipelines are not starved
    return item
def handle(self, *args, **options):
    """Create city records from saved HTML pages, locally or over SFTP.

    When ``--ssh`` is given, ``pages_dir`` may be ``[user@]host:path``;
    missing credentials are prompted for / taken from options. All pages
    are processed inside one atomic transaction; per-file failures from
    ``create_city_record`` are collected and reported at the end, while
    an unexpected exception aborts (and rolls back) the whole run.
    """
    pages_dir = options['pages_dir']
    client = None
    if options['ssh']:
        # Use sftp to access file
        logging_file = options['E']
        if logging_file:
            logging_file = os.path.expanduser(logging_file)
        port = options['port']
        # get hostname — accept "user@host:path" in pages_dir
        username = ''
        if pages_dir.find(':') > 0:
            hostname, pages_dir = pages_dir.split(':')
            if hostname.find('@') >= 0:
                username, hostname = hostname.split('@')
        else:
            hostname = input('Hostname: ')
            if len(hostname) == 0:
                raise CommandError('*** Hostname required.')
        # get username — default to the local login name
        if username == '':
            username = getpass.getuser()
        # get password
        password = options['password']
        # connect, translating SSH failures into CommandError
        try:
            client = get_ssh_client(hostname, username, password, port,
                                    log_file=logging_file)
        except ssh_exception.PasswordRequiredException:
            raise CommandError('Password is required.')
        except ssh_exception.AuthenticationException:
            raise CommandError('Authentication failed.')
        except ssh_exception.ServerNotKnown:
            raise CommandError('Unknown server.')
        except ssh_exception.ConnectionTimeOut:
            raise CommandError('Connection timed out.')
    # Collect any error info for each file
    errors = []
    # Collect update datetime for each successful creation classified by city
    success = {}
    # FIX: pre-bind so the except clause cannot NameError when the
    # generator raises before its first yield
    file_name = None
    try:
        with transaction.atomic():
            for file_name, content in get_html_files_from_dir(
                    pages_dir, client, city_names=options['city']):
                # NOTE(review): "fileds" is the helper's actual name upstream.
                info_dict = utils.append_extra_fileds(
                    extractors.process_parsed_dict(
                        extractors.parse_info_dict(
                            extractors.extract_info(content))))
                name_en = re.search(r'([a-z]+)\.html', file_name).group(1)
                info_dict['city']['area_en'] = name_en
                rel_path = os.path.relpath(file_name, pages_dir)
                result = create_city_record(info_dict)
                if result['success'] == 0:
                    # keep the failure details for the final report
                    del result['success']
                    result['file'] = rel_path
                    result['info_dict'] = info_dict
                    errors.append(result)
                else:
                    if name_en not in success:
                        success[name_en] = []
                    record = result['info']
                    success[name_en].append(record.update_dtm)
    except Exception as e:
        self.stderr.write('Exception raised from {}: {}'.format(file_name,
                                                                repr(e)))
        # FIX: bare raise preserves the original traceback
        raise
    finally:
        # FIX: the SSH client was never closed (resource leak)
        if client:
            client.close()
    error_num = len(errors)
    # FIX: iterating a dict yields keys, so the old
    # map(lambda item: (item[0], len(item[1])), success) produced
    # {first_char: 1} entries — iterate .items() instead
    success_num_by_city = {city: len(dtms) for city, dtms in success.items()}
    success_num = sum(success_num_by_city.values())
    self.stdout.write('Successfully collect {} city records.'.format(success_num))
    for city, num in success_num_by_city.items():
        self.stdout.write('{}: {}'.format(city, num))
    self.stdout.write('Fail to create {} city records.'.format(error_num))
    for error in errors:
        self.stdout.write('file: {}, type: {}, info: {}'.format(
            error['file'], error['error_type'], error['info']
        ))
        # FIX: {'ValidationError' or 'ValueError'} evaluates to
        # {'ValidationError'} — ValueError entries never matched
        if error['error_type'] in {'ValidationError', 'ValueError'}:
            self.stdout.write('{}'.format(pprint.pformat(error['info_dict'])))
def handle(self, *args, **options):
    """Import city and station names from saved HTML pages.

    Supports local directories or ``[user@]host:path`` over SFTP (with
    ``--ssh``). Each page yields one ``City`` plus its stations;
    ``ValidationError`` from ``validate_and_create`` is counted as a
    duplicate (NOTE(review): this assumes uniqueness is the only
    validation that can fail — confirm against the model validators).
    Prints scanned/saved totals and the duplicate maps at the end.
    """
    pages_dir = options['pages_dir']
    client = None
    if options['ssh']:
        # Use sftp to access file
        logging_file = options['E']
        if logging_file:
            logging_file = os.path.expanduser(logging_file)
        port = options['port']
        # get hostname — accept "user@host:path" in pages_dir
        username = ''
        if pages_dir.find(':') > 0:
            hostname, pages_dir = pages_dir.split(':')
            if hostname.find('@') >= 0:
                username, hostname = hostname.split('@')
        else:
            hostname = input('Hostname: ')
            if len(hostname) == 0:
                raise CommandError('*** Hostname required.')
        # get username — default to the local login name
        if username == '':
            username = getpass.getuser()
        # get password
        password = options['password']
        # connect, translating SSH failures into CommandError
        try:
            client = get_ssh_client(hostname, username, password, port,
                                    log_file=logging_file)
        except ssh_exception.PasswordRequiredException:
            raise CommandError('Password is required.')
        except ssh_exception.AuthenticationException:
            raise CommandError('Authentication failed.')
        except ssh_exception.ServerNotKnown:
            raise CommandError('Unknown server.')
        except ssh_exception.ConnectionTimeOut:
            raise CommandError('Connection timed out.')
    station_duplicates = {}
    city_duplicates = {}
    total_city = 0
    total_station = 0
    try:
        with transaction.atomic():
            for file_name, content in get_html_files_from_dir(
                    pages_dir, client, city_names=options['city']):
                total_city += 1
                info_dict = extractors.process_parsed_dict(
                    extractors.parse_info_dict(
                        extractors.extract_info(content)))
                name_en = re.search(r'([a-z]+)\.html', file_name).group(1)
                name_cn = info_dict['city']['area_cn']
                try:
                    city = City.objects.validate_and_create(
                        name_en=name_en, name_cn=name_cn)
                except ValidationError:
                    # duplicate city: count it and skip its stations
                    self.count_duplicates(city_duplicates, name_en)
                    continue
                for station_name in info_dict['stations']:
                    total_station += 1
                    try:
                        Station.objects.validate_and_create(
                            name_cn=station_name, city=city)
                    except ValidationError:
                        self.count_duplicates(station_duplicates,
                                              (name_en, station_name))
    except Exception as e:
        self.stderr.write('Exception raised :{}'.format(repr(e)))
        # FIX: bare raise preserves the original traceback
        raise
    finally:
        # FIX: close() used to sit after the prints, OUTSIDE any finally,
        # so the SSH client leaked whenever the try block re-raised
        if client:
            client.close()
    # print result
    self.stdout.write(
        'Total cities scanned: {}. {} new city info is saved.'.format(
            total_city, total_city - self.calc_duplicates(city_duplicates)))
    self.stdout.write('Duplicate cities are: {}'.format(
        pprint.pformat(city_duplicates, indent=4)))
    self.stdout.write(
        'Total stations scanned: {}. {} new station info is saved.'.format(
            total_station,
            total_station - self.calc_duplicates(station_duplicates)))
    self.stdout.write('Duplicate stations are: {}'.format(
        pprint.pformat(station_duplicates, indent=4)))
def handle(self, *args, **options):
    """Import city and station names from saved HTML pages.

    Supports local directories or ``[user@]host:path`` over SFTP (with
    ``--ssh``). Each page yields one ``City`` plus its stations;
    ``ValidationError`` from ``validate_and_create`` is counted as a
    duplicate (NOTE(review): this assumes uniqueness is the only
    validation that can fail — confirm against the model validators).
    Prints scanned/saved totals and the duplicate maps at the end.
    """
    pages_dir = options["pages_dir"]
    client = None
    if options["ssh"]:
        # Use sftp to access file
        logging_file = options["E"]
        if logging_file:
            logging_file = os.path.expanduser(logging_file)
        port = options["port"]
        # get hostname — accept "user@host:path" in pages_dir
        username = ""
        if pages_dir.find(":") > 0:
            hostname, pages_dir = pages_dir.split(":")
            if hostname.find("@") >= 0:
                username, hostname = hostname.split("@")
        else:
            hostname = input("Hostname: ")
            if len(hostname) == 0:
                raise CommandError("*** Hostname required.")
        # get username — default to the local login name
        if username == "":
            username = getpass.getuser()
        # get password
        password = options["password"]
        # connect, translating SSH failures into CommandError
        try:
            client = get_ssh_client(hostname, username, password, port, log_file=logging_file)
        except ssh_exception.PasswordRequiredException:
            raise CommandError("Password is required.")
        except ssh_exception.AuthenticationException:
            raise CommandError("Authentication failed.")
        except ssh_exception.ServerNotKnown:
            raise CommandError("Unknown server.")
        except ssh_exception.ConnectionTimeOut:
            raise CommandError("Connection timed out.")
    station_duplicates = {}
    city_duplicates = {}
    total_city = 0
    total_station = 0
    try:
        with transaction.atomic():
            for file_name, content in get_html_files_from_dir(pages_dir, client, city_names=options["city"]):
                total_city += 1
                info_dict = extractors.process_parsed_dict(
                    extractors.parse_info_dict(extractors.extract_info(content))
                )
                name_en = re.search(r"([a-z]+)\.html", file_name).group(1)
                name_cn = info_dict["city"]["area_cn"]
                try:
                    city = City.objects.validate_and_create(name_en=name_en, name_cn=name_cn)
                except ValidationError:
                    # duplicate city: count it and skip its stations
                    self.count_duplicates(city_duplicates, name_en)
                    continue
                for station_name in info_dict["stations"]:
                    total_station += 1
                    try:
                        Station.objects.validate_and_create(name_cn=station_name, city=city)
                    except ValidationError:
                        self.count_duplicates(station_duplicates, (name_en, station_name))
    except Exception as e:
        self.stderr.write("Exception raised :{}".format(repr(e)))
        # FIX: bare raise preserves the original traceback
        raise
    finally:
        # FIX: close() used to sit after the prints, OUTSIDE any finally,
        # so the SSH client leaked whenever the try block re-raised
        if client:
            client.close()
    # print result
    self.stdout.write(
        "Total cities scanned: {}. {} new city info is saved.".format(
            total_city, total_city - self.calc_duplicates(city_duplicates)
        )
    )
    self.stdout.write("Duplicate cities are: {}".format(pprint.pformat(city_duplicates, indent=4)))
    self.stdout.write(
        "Total stations scanned: {}. {} new station info is saved.".format(
            total_station, total_station - self.calc_duplicates(station_duplicates)
        )
    )
    self.stdout.write("Duplicate stations are: {}".format(pprint.pformat(station_duplicates, indent=4)))