Beispiel #1
0
 def gen_info_dict():
     """Yield the extracted info dict of every page that carries data.

     Iterates over the ``(name, content)`` pairs produced by
     ``get_html_files_from_dir(pages_dir, client)`` and runs
     ``extractors.extract_info`` on each page. ``total[0]`` is incremented
     for every page scanned. A page whose extraction result is empty, or
     whose first entry is falsy, is not yielded; its city name is appended
     to ``empty_cities`` instead.

     ``pages_dir``, ``client``, ``total`` and ``empty_cities`` come from the
     enclosing scope.
     """
     for name, content in get_html_files_from_dir(pages_dir, client):
         # City name: the run of lowercase letters directly before ".html".
         city_name = re.search(r'([a-z]+)\.html', name).group(1)
         info_dict = extractors.extract_info(content)
         total[0] += 1
         # Only the first entry decides whether the page is empty. An empty
         # dict is treated as empty as well instead of raising IndexError.
         first_value = next(iter(info_dict.values()), None)
         if not first_value:
             empty_cities.append(city_name)
             continue
         yield info_dict
Beispiel #2
0
 def gen_info_dict():
     """Generate the info dict of each scanned page that is not empty.

     For every ``(name, content)`` pair returned by
     ``get_html_files_from_dir(pages_dir, client)`` the page is passed to
     ``extractors.extract_info`` and ``total[0]`` is incremented. If the
     result has no entries, or its first entry is falsy, the city name is
     recorded in ``empty_cities`` and nothing is yielded for that page.

     ``pages_dir``, ``client``, ``total`` and ``empty_cities`` are read from
     the enclosing scope.
     """
     for name, content in get_html_files_from_dir(pages_dir, client):
         # The lowercase letters directly before ".html" name the city.
         city_name = re.search(r'([a-z]+)\.html', name).group(1)
         info_dict = extractors.extract_info(content)
         total[0] += 1
         # Inspect the first entry only; fall back to None for an empty
         # dict so it is reported as an empty city rather than crashing
         # with IndexError.
         first_value = next(iter(info_dict.values()), None)
         if not first_value:
             empty_cities.append(city_name)
             continue
         yield info_dict
Beispiel #3
0
    def process_item(self, item, spider):
        """Save the city record parsed from a ``PageItem`` and back up its page.

        Items that are not ``PageItem`` instances are ignored. For a
        ``PageItem`` the page bytes are decoded, run through the extractor
        chain and handed to ``models.create_city_record``. Failures while
        parsing or saving are logged through ``spider.custom_logger`` and are
        not re-raised. The raw page bytes are always written to
        ``<spider.res_dir>/<name>.html``, whether or not a record was saved.

        :param item: the scraped item; ``item['name']`` is the city name and
            ``item['page']`` the raw page bytes.
        :param spider: provides ``custom_logger`` and ``res_dir``.
        :return: ``None``.
        """
        # NOTE(review): this looks like a Scrapy item pipeline, where
        # process_item is normally expected to return the item; this method
        # returns None - confirm no later pipeline stage depends on it.
        logger = spider.custom_logger
        if isinstance(item, PageItem):
            city_name = item['name']
            # save record to database first
            try:
                # Decode inside the try block: an undecodable page is then
                # reported as a parse failure and the raw bytes are still
                # backed up by the finally clause below.
                page_content = item['page'].decode()
                info_dict = utils.append_extra_fileds(
                    extractors.process_parsed_dict(
                        extractors.parse_info_dict(
                            extractors.extract_info(page_content)
                        )
                    )
                )

                try:
                    info_dict['city']['area_en'] = city_name

                    create_status = models.create_city_record(info_dict)
                    record_info = '{name} on {dtm}'.format(name=city_name, dtm=info_dict['update_dtm'])
                    if create_status['success'] == 1:
                        logger.info('Successfully save a record: {}'.format(record_info))
                    else:
                        err_type = create_status['error_type']
                        if err_type == 'UniquenessError':
                            # Logger.warn is a deprecated alias that was
                            # removed in Python 3.13; use warning().
                            logger.warning('Ignore duplicate record: {}'.format(record_info))
                        else:
                            logger.error('Fail to save record: {record} because of {err_type}: {err_msg}'.format(
                                record=record_info,
                                err_type=err_type,
                                err_msg=create_status['info']
                            ))
                except Exception as e:
                    logger.error("Exception raised when saving record of city '{city}': {e}".format(
                        city=city_name,
                        e=repr(e)
                    ))

            except Exception as e:
                logger.error("Exception raised when parsing page of city '{city}': {e}".format(
                    city=city_name,
                    e=repr(e)
                ))
            finally:
                # backup file
                file_name = '{}.html'.format(city_name)
                with open(os.path.join(spider.res_dir, file_name), 'wb') as f:
                    f.write(item['page'])
                logger.info("Successfully backup the page of city '{}'".format(city_name))
Beispiel #4
0
    def process_item(self, item, spider):
        """Store the record extracted from a ``PageItem``, then back up the page.

        Anything that is not a ``PageItem`` is left alone. Otherwise the page
        bytes are decoded, passed through the extractor chain and given to
        ``models.create_city_record``. Parse and save failures are logged via
        ``spider.custom_logger`` and swallowed. Regardless of the outcome the
        raw page bytes are written to ``<spider.res_dir>/<name>.html``.

        :param item: the scraped item; ``item['name']`` is the city name and
            ``item['page']`` the raw page bytes.
        :param spider: provides ``custom_logger`` and ``res_dir``.
        :return: ``None``.
        """
        # NOTE(review): this looks like a Scrapy item pipeline, where
        # process_item is normally expected to return the item; this method
        # returns None - confirm no later pipeline stage depends on it.
        logger = spider.custom_logger
        if isinstance(item, PageItem):
            city_name = item['name']
            # save record to database first
            try:
                # Decoding happens inside the try block so that a page that
                # cannot be decoded is logged as a parse failure and still
                # reaches the backup step in the finally clause.
                page_content = item['page'].decode()
                info_dict = utils.append_extra_fileds(
                    extractors.process_parsed_dict(
                        extractors.parse_info_dict(
                            extractors.extract_info(page_content))))

                try:
                    info_dict['city']['area_en'] = city_name

                    create_status = models.create_city_record(info_dict)
                    record_info = '{name} on {dtm}'.format(
                        name=city_name, dtm=info_dict['update_dtm'])
                    if create_status['success'] == 1:
                        logger.info('Successfully save a record: {}'.format(
                            record_info))
                    else:
                        err_type = create_status['error_type']
                        if err_type == 'UniquenessError':
                            # Logger.warn is a deprecated alias that was
                            # removed in Python 3.13; use warning().
                            logger.warning(
                                'Ignore duplicate record: {}'.format(
                                    record_info))
                        else:
                            logger.error(
                                'Fail to save record: {record} because of {err_type}: {err_msg}'
                                .format(record=record_info,
                                        err_type=err_type,
                                        err_msg=create_status['info']))
                except Exception as e:
                    logger.error(
                        "Exception raised when saving record of city '{city}': {e}"
                        .format(city=city_name, e=repr(e)))

            except Exception as e:
                logger.error(
                    "Exception raised when parsing page of city '{city}': {e}".
                    format(city=city_name, e=repr(e)))
            finally:
                # backup file
                file_name = '{}.html'.format(city_name)
                with open(os.path.join(spider.res_dir, file_name), 'wb') as f:
                    f.write(item['page'])
                logger.info("Successfully backup the page of city '{}'".format(
                    city_name))
    def handle(self, *args, **options):
        """Create city records from HTML pages in a local or remote directory.

        Options read:

        * ``pages_dir`` - directory holding the pages; with ``ssh`` it may be
          given as ``[user@]host:path``.
        * ``ssh`` - when truthy, the pages are read through a client obtained
          from ``get_ssh_client``.
        * ``E`` - optional log file path (``~`` is expanded) passed to
          ``get_ssh_client`` as ``log_file``.
        * ``port``, ``password`` - passed to ``get_ssh_client``.
        * ``city`` - passed to ``get_html_files_from_dir`` as ``city_names``.

        If no host is embedded in ``pages_dir`` the hostname is prompted for;
        the username defaults to ``getpass.getuser()``.

        All records are created inside a single ``transaction.atomic()``
        block. Afterwards a per-city success count and the list of failed
        files are written to ``self.stdout``.

        :raises CommandError: if the hostname is empty or the SSH connection
            cannot be established.
        :raises Exception: any exception raised during the import is written
            to ``self.stderr`` and re-raised.
        """
        pages_dir = options['pages_dir']
        client = None

        if options['ssh']:
            # Use sftp to access file
            logging_file = options['E']
            if logging_file:
                logging_file = os.path.expanduser(logging_file)

            port = options['port']
            # get hostname
            username = ''
            if pages_dir.find(':') > 0:
                # Split on the first colon only so that a remote path which
                # itself contains ':' does not break the unpacking.
                hostname, pages_dir = pages_dir.split(':', 1)
                if '@' in hostname:
                    # Split on the last '@' so the host part stays intact.
                    username, hostname = hostname.rsplit('@', 1)
            else:
                hostname = input('Hostname: ')
            if not hostname:
                raise CommandError('*** Hostname required.')

            # get username
            if not username:
                username = getpass.getuser()

            # get password
            password = options['password']

            # connect
            try:
                client = get_ssh_client(hostname, username, password, port, log_file=logging_file)
            except ssh_exception.PasswordRequiredException:
                raise CommandError('Password is required.')
            except ssh_exception.AuthenticationException:
                raise CommandError('Authentication failed.')
            except ssh_exception.ServerNotKnown:
                raise CommandError('Unknown server.')
            except ssh_exception.ConnectionTimeOut:
                raise CommandError('Connection timed out.')

        # Collect any error info for each file
        errors = []
        # Collect update datetime for each successful creation classified by city
        success = {}
        # Pre-bind so the except handler below can always report a source,
        # even when the failure happens before the first file is yielded.
        file_name = None
        try:
            with transaction.atomic():
                for file_name, content in get_html_files_from_dir(pages_dir, client, city_names=options['city']):
                    info_dict = utils.append_extra_fileds(
                        extractors.process_parsed_dict(
                            extractors.parse_info_dict(
                                extractors.extract_info(content)
                            )
                        )
                    )
                    name_en = re.search(r'([a-z]+)\.html', file_name).group(1)
                    info_dict['city']['area_en'] = name_en

                    rel_path = os.path.relpath(file_name, pages_dir)

                    result = create_city_record(info_dict)
                    if result['success'] == 0:
                        del result['success']
                        result['file'] = rel_path
                        result['info_dict'] = info_dict
                        errors.append(result)
                    else:
                        record = result['info']
                        success.setdefault(name_en, []).append(record.update_dtm)
        except Exception as e:
            source = file_name if file_name is not None else pages_dir
            self.stderr.write('Exception raised from {}: {}'.format(source, repr(e)))
            raise
        finally:
            # Release the SSH connection whether or not the import succeeded.
            if client:
                client.close()

        error_num = len(errors)
        # Iterate over items(): iterating the dict itself yields only the
        # city names, which made the per-city counts wrong.
        success_num_by_city = {city: len(dtms) for city, dtms in success.items()}
        success_num = sum(success_num_by_city.values())

        self.stdout.write('Successfully collect {} city records.'.format(success_num))
        for city, num in success_num_by_city.items():
            self.stdout.write('{}: {}'.format(city, num))

        self.stdout.write('Fail to create {} city records.'.format(error_num))
        for error in errors:
            self.stdout.write('file: {}, type: {}, info: {}'.format(
                error['file'], error['error_type'], error['info']
            ))
            # Both error types get the full dump; the former
            # {'ValidationError' or 'ValueError'} only ever matched the first.
            if error['error_type'] in {'ValidationError', 'ValueError'}:
                self.stdout.write('{}'.format(pprint.pformat(error['info_dict'])))
Beispiel #6
0
    def handle(self, *args, **options):
        """Create ``City`` and ``Station`` rows from HTML pages in a directory.

        Options read:

        * ``pages_dir`` - directory holding the pages; with ``ssh`` it may be
          given as ``[user@]host:path``.
        * ``ssh`` - when truthy, the pages are read through a client obtained
          from ``get_ssh_client``.
        * ``E`` - optional log file path (``~`` is expanded) passed to
          ``get_ssh_client`` as ``log_file``.
        * ``port``, ``password`` - passed to ``get_ssh_client``.
        * ``city`` - passed to ``get_html_files_from_dir`` as ``city_names``.

        If no host is embedded in ``pages_dir`` the hostname is prompted for;
        the username defaults to ``getpass.getuser()``.

        Everything is created inside one ``transaction.atomic()`` block. A
        ``ValidationError`` from ``validate_and_create`` is counted as a
        duplicate; for a city this also skips its stations. A summary of
        scanned and duplicate cities and stations is written to
        ``self.stdout``.

        :raises CommandError: if the hostname is empty or the SSH connection
            cannot be established.
        :raises Exception: any other exception raised during the import is
            written to ``self.stderr`` and re-raised.
        """
        pages_dir = options['pages_dir']
        client = None

        if options['ssh']:
            # Use sftp to access file
            logging_file = options['E']
            if logging_file:
                logging_file = os.path.expanduser(logging_file)

            port = options['port']
            # get hostname
            username = ''
            if pages_dir.find(':') > 0:
                # Split on the first colon only so that a remote path which
                # itself contains ':' does not break the unpacking.
                hostname, pages_dir = pages_dir.split(':', 1)
                if '@' in hostname:
                    # Split on the last '@' so the host part stays intact.
                    username, hostname = hostname.rsplit('@', 1)
            else:
                hostname = input('Hostname: ')
            if not hostname:
                raise CommandError('*** Hostname required.')

            # get username
            if not username:
                username = getpass.getuser()

            # get password
            password = options['password']

            # connect
            try:
                client = get_ssh_client(hostname,
                                        username,
                                        password,
                                        port,
                                        log_file=logging_file)
            except ssh_exception.PasswordRequiredException:
                raise CommandError('Password is required.')
            except ssh_exception.AuthenticationException:
                raise CommandError('Authentication failed.')
            except ssh_exception.ServerNotKnown:
                raise CommandError('Unknown server.')
            except ssh_exception.ConnectionTimeOut:
                raise CommandError('Connection timed out.')

        station_duplicates = {}
        city_duplicates = {}
        total_city = 0
        total_station = 0

        try:
            with transaction.atomic():
                for file_name, content in get_html_files_from_dir(
                        pages_dir, client, city_names=options['city']):
                    total_city += 1
                    info_dict = extractors.process_parsed_dict(
                        extractors.parse_info_dict(
                            extractors.extract_info(content)))
                    name_en = re.search(r'([a-z]+)\.html', file_name).group(1)
                    name_cn = info_dict['city']['area_cn']
                    try:
                        city = City.objects.validate_and_create(
                            name_en=name_en, name_cn=name_cn)
                    except ValidationError:
                        self.count_duplicates(city_duplicates, name_en)
                        continue

                    for station_name in info_dict['stations']:
                        total_station += 1
                        try:
                            Station.objects.validate_and_create(
                                name_cn=station_name, city=city)
                        except ValidationError:
                            self.count_duplicates(station_duplicates,
                                                  (name_en, station_name))
        except Exception as e:
            self.stderr.write('Exception raised :{}'.format(repr(e)))
            raise
        finally:
            # Release the SSH connection on the failure path as well; it used
            # to be closed only after a fully successful run.
            if client:
                client.close()

        # print result
        self.stdout.write(
            'Total cities scanned: {}. {} new city info is saved.'.format(
                total_city,
                total_city - self.calc_duplicates(city_duplicates)))
        self.stdout.write('Duplicate cities are: {}'.format(
            pprint.pformat(city_duplicates, indent=4)))
        self.stdout.write(
            'Total stations scanned: {}. {} new station info is saved.'.format(
                total_station,
                total_station - self.calc_duplicates(station_duplicates)))
        self.stdout.write('Duplicate stations are: {}'.format(
            pprint.pformat(station_duplicates, indent=4)))
    def handle(self, *args, **options):
        """Import cities and their stations from HTML pages in a directory.

        Options read:

        * ``pages_dir`` - directory holding the pages; with ``ssh`` it may be
          given as ``[user@]host:path``.
        * ``ssh`` - when truthy, the pages are read through a client obtained
          from ``get_ssh_client``.
        * ``E`` - optional log file path (``~`` is expanded) passed to
          ``get_ssh_client`` as ``log_file``.
        * ``port``, ``password`` - passed to ``get_ssh_client``.
        * ``city`` - passed to ``get_html_files_from_dir`` as ``city_names``.

        If no host is embedded in ``pages_dir`` the hostname is prompted for;
        the username defaults to ``getpass.getuser()``.

        All rows are created inside one ``transaction.atomic()`` block. A
        ``ValidationError`` from ``validate_and_create`` is counted as a
        duplicate; for a city this also skips its stations. Totals and the
        duplicates found are written to ``self.stdout``.

        :raises CommandError: if the hostname is empty or the SSH connection
            cannot be established.
        :raises Exception: any other exception raised during the import is
            written to ``self.stderr`` and re-raised.
        """
        pages_dir = options["pages_dir"]
        client = None

        if options["ssh"]:
            # Use sftp to access file
            logging_file = options["E"]
            if logging_file:
                logging_file = os.path.expanduser(logging_file)

            port = options["port"]
            # get hostname
            username = ""
            if pages_dir.find(":") > 0:
                # Split on the first colon only so that a remote path which
                # itself contains ':' does not break the unpacking.
                hostname, pages_dir = pages_dir.split(":", 1)
                if "@" in hostname:
                    # Split on the last '@' so the host part stays intact.
                    username, hostname = hostname.rsplit("@", 1)
            else:
                hostname = input("Hostname: ")
            if not hostname:
                raise CommandError("*** Hostname required.")

            # get username
            if not username:
                username = getpass.getuser()

            # get password
            password = options["password"]

            # connect
            try:
                client = get_ssh_client(hostname, username, password, port, log_file=logging_file)
            except ssh_exception.PasswordRequiredException:
                raise CommandError("Password is required.")
            except ssh_exception.AuthenticationException:
                raise CommandError("Authentication failed.")
            except ssh_exception.ServerNotKnown:
                raise CommandError("Unknown server.")
            except ssh_exception.ConnectionTimeOut:
                raise CommandError("Connection timed out.")

        station_duplicates = {}
        city_duplicates = {}
        total_city = 0
        total_station = 0

        try:
            with transaction.atomic():
                for file_name, content in get_html_files_from_dir(pages_dir, client, city_names=options["city"]):
                    total_city += 1
                    info_dict = extractors.process_parsed_dict(
                        extractors.parse_info_dict(extractors.extract_info(content))
                    )
                    name_en = re.search(r"([a-z]+)\.html", file_name).group(1)
                    name_cn = info_dict["city"]["area_cn"]
                    try:
                        city = City.objects.validate_and_create(name_en=name_en, name_cn=name_cn)
                    except ValidationError:
                        self.count_duplicates(city_duplicates, name_en)
                        continue

                    for station_name in info_dict["stations"]:
                        total_station += 1
                        try:
                            Station.objects.validate_and_create(name_cn=station_name, city=city)
                        except ValidationError:
                            self.count_duplicates(station_duplicates, (name_en, station_name))
        except Exception as e:
            self.stderr.write("Exception raised :{}".format(repr(e)))
            raise
        finally:
            # Release the SSH connection on the failure path as well; it used
            # to be closed only after a fully successful run.
            if client:
                client.close()

        # print result
        self.stdout.write(
            "Total cities scanned: {}. {} new city info is saved.".format(
                total_city, total_city - self.calc_duplicates(city_duplicates)
            )
        )
        self.stdout.write("Duplicate cities are: {}".format(pprint.pformat(city_duplicates, indent=4)))
        self.stdout.write(
            "Total stations scanned: {}. {} new station info is saved.".format(
                total_station, total_station - self.calc_duplicates(station_duplicates)
            )
        )
        self.stdout.write("Duplicate stations are: {}".format(pprint.pformat(station_duplicates, indent=4)))