Example #1
0
 def gen_info_dict():
     """Yield the extracted info dict of every page that has data.

     Walks the HTML files returned by ``get_html_files_from_dir`` for the
     enclosing scope's ``pages_dir``/``client``, bumps ``total[0]`` once per
     file, and records in ``empty_cities`` the city name of every page whose
     first info-dict entry is falsy. Such pages are not yielded.
     """
     for name, content in get_html_files_from_dir(pages_dir, client):
         # The city name is the lowercase stem right before ".html".
         matched = re.search(r'([a-z]+)\.html', name)
         city_name = matched.group(1)
         info_dict = extractors.extract_info(content)
         total[0] += 1

         # Only the first key's value decides whether the page is "empty".
         first_key = list(info_dict.keys())[0]
         if info_dict[first_key]:
             yield info_dict
         else:
             empty_cities.append(city_name)
Example #2
0
 def gen_info_dict():
     """Generate info dicts for pages whose first entry is non-empty.

     For each ``(name, content)`` pair produced by
     ``get_html_files_from_dir(pages_dir, client)`` this increments the
     shared counter ``total[0]``. When the value stored under the info
     dict's first key is falsy, the city name parsed from the file name is
     appended to ``empty_cities`` and nothing is yielded for that page.
     """
     for name, content in get_html_files_from_dir(pages_dir, client):
         # File names are expected to end in "<lowercase letters>.html".
         city_name = re.search(r'([a-z]+)\.html', name).group(1)
         info_dict = extractors.extract_info(content)
         total[0] += 1

         leading_key = list(info_dict.keys())[0]
         leading_value = info_dict[leading_key]
         if leading_value:
             yield info_dict
         else:
             empty_cities.append(city_name)
Example #3
0
    def handle(self, *args, **options):
        """Create city records from HTML pages and print a summary.

        Every file yielded by ``get_html_files_from_dir`` is run through the
        ``extractors`` pipeline and passed to ``create_city_record`` inside a
        single ``transaction.atomic()`` block. Failures reported by
        ``create_city_record`` are collected and listed at the end; successes
        are counted per city.

        Options read:
            pages_dir: directory holding the pages. With ``ssh`` set it may
                be written as ``[user@]host:path``.
            ssh: when truthy, the files are read through an SSH client.
            E: optional log file path handed to ``get_ssh_client``.
            port, password: forwarded to ``get_ssh_client``.
            city: forwarded to ``get_html_files_from_dir`` as ``city_names``.

        Raises:
            CommandError: when no hostname is given or the SSH connection
                cannot be established.
            Exception: anything raised while importing is written to stderr
                and re-raised.
        """
        pages_dir = options['pages_dir']
        client = None

        if options['ssh']:
            # Use sftp to access file
            logging_file = options['E']
            if logging_file:
                logging_file = os.path.expanduser(logging_file)

            port = options['port']
            # get hostname
            username = ''
            if pages_dir.find(':') > 0:
                # Split on the first ':' only, so a remote path that itself
                # contains ':' does not break the unpacking.
                hostname, pages_dir = pages_dir.split(':', 1)
                if hostname.find('@') >= 0:
                    # The host is whatever follows the last '@'.
                    username, hostname = hostname.rsplit('@', 1)
            else:
                hostname = input('Hostname: ')
            if not hostname:
                raise CommandError('*** Hostname required.')

            # get username
            if username == '':
                username = getpass.getuser()

            # get password
            password = options['password']

            # connect
            try:
                client = get_ssh_client(hostname, username, password, port, log_file=logging_file)
            except ssh_exception.PasswordRequiredException:
                raise CommandError('Password is required.')
            except ssh_exception.AuthenticationException:
                raise CommandError('Authentication failed.')
            except ssh_exception.ServerNotKnown:
                raise CommandError('Unknown server.')
            except ssh_exception.ConnectionTimeOut:
                raise CommandError('Connection timed out.')

        # Collect any error info for each file
        errors = []
        # Collect update datetime for each successful creation classified by city
        success = {}
        # Bound before the loop so the except clause can always report it,
        # even when the failure happens before the first file is yielded.
        file_name = None
        try:
            with transaction.atomic():
                for file_name, content in get_html_files_from_dir(pages_dir, client, city_names=options['city']):
                    info_dict = utils.append_extra_fileds(
                        extractors.process_parsed_dict(
                            extractors.parse_info_dict(
                                extractors.extract_info(content)
                            )
                        )
                    )
                    name_en = re.search(r'([a-z]+)\.html', file_name).group(1)
                    info_dict['city']['area_en'] = name_en

                    rel_path = os.path.relpath(file_name, pages_dir)

                    result = create_city_record(info_dict)
                    if result['success'] == 0:
                        # Keep the failure details plus enough context to
                        # print a useful report afterwards.
                        del result['success']
                        result['file'] = rel_path
                        result['info_dict'] = info_dict
                        errors.append(result)
                    else:
                        record = result['info']
                        success.setdefault(name_en, []).append(record.update_dtm)
        except Exception as e:
            self.stderr.write('Exception raised from {}: {}'.format(file_name, repr(e)))
            raise
        finally:
            # Release the SSH connection whether or not the import succeeded.
            if client:
                client.close()

        error_num = len(errors)
        # Iterate over items(): iterating the dict itself yields only keys.
        success_num_by_city = {
            city: len(update_dtms) for city, update_dtms in success.items()
        }
        success_num = sum(success_num_by_city.values())

        self.stdout.write('Successfully collect {} city records.'.format(success_num))
        for city, num in success_num_by_city.items():
            self.stdout.write('{}: {}'.format(city, num))

        self.stdout.write('Fail to create {} city records.'.format(error_num))
        for error in errors:
            self.stdout.write('file: {}, type: {}, info: {}'.format(
                error['file'], error['error_type'], error['info']
            ))
            # Dump the offending data for both validation-style error types.
            if error['error_type'] in {'ValidationError', 'ValueError'}:
                self.stdout.write('{}'.format(pprint.pformat(error['info_dict'])))
Example #4
0
    def handle(self, *args, **options):
        """Create City and Station rows from HTML pages and report duplicates.

        Every file yielded by ``get_html_files_from_dir`` is parsed through
        the ``extractors`` pipeline inside a single ``transaction.atomic()``
        block. A city or station whose ``validate_and_create`` call raises
        ``ValidationError`` is tallied through ``self.count_duplicates``
        instead of aborting the run. When a city is rejected its stations
        are skipped.

        Options read:
            pages_dir: directory holding the pages. With ``ssh`` set it may
                be written as ``[user@]host:path``.
            ssh: when truthy, the files are read through an SSH client.
            E: optional log file path handed to ``get_ssh_client``.
            port, password: forwarded to ``get_ssh_client``.
            city: forwarded to ``get_html_files_from_dir`` as ``city_names``.

        Raises:
            CommandError: when no hostname is given or the SSH connection
                cannot be established.
            Exception: anything else raised while importing is written to
                stderr and re-raised.
        """
        pages_dir = options['pages_dir']
        client = None

        if options['ssh']:
            # Use sftp to access file
            logging_file = options['E']
            if logging_file:
                logging_file = os.path.expanduser(logging_file)

            port = options['port']
            # get hostname
            username = ''
            if pages_dir.find(':') > 0:
                # Split on the first ':' only, so a remote path that itself
                # contains ':' does not break the unpacking.
                hostname, pages_dir = pages_dir.split(':', 1)
                if hostname.find('@') >= 0:
                    # The host is whatever follows the last '@'.
                    username, hostname = hostname.rsplit('@', 1)
            else:
                hostname = input('Hostname: ')
            if not hostname:
                raise CommandError('*** Hostname required.')

            # get username
            if username == '':
                username = getpass.getuser()

            # get password
            password = options['password']

            # connect
            try:
                client = get_ssh_client(hostname,
                                        username,
                                        password,
                                        port,
                                        log_file=logging_file)
            except ssh_exception.PasswordRequiredException:
                raise CommandError('Password is required.')
            except ssh_exception.AuthenticationException:
                raise CommandError('Authentication failed.')
            except ssh_exception.ServerNotKnown:
                raise CommandError('Unknown server.')
            except ssh_exception.ConnectionTimeOut:
                raise CommandError('Connection timed out.')

        station_duplicates = {}
        city_duplicates = {}
        total_city = 0
        total_station = 0

        try:
            with transaction.atomic():
                for file_name, content in get_html_files_from_dir(
                        pages_dir, client, city_names=options['city']):
                    total_city += 1
                    info_dict = extractors.process_parsed_dict(
                        extractors.parse_info_dict(
                            extractors.extract_info(content)))
                    name_en = re.search(r'([a-z]+)\.html', file_name).group(1)
                    name_cn = info_dict['city']['area_cn']
                    try:
                        city = City.objects.validate_and_create(
                            name_en=name_en, name_cn=name_cn)
                    except ValidationError:
                        # A rejected city is counted as a duplicate and its
                        # stations are not processed.
                        self.count_duplicates(city_duplicates, name_en)
                        continue

                    for station_name in info_dict['stations']:
                        total_station += 1
                        try:
                            Station.objects.validate_and_create(
                                name_cn=station_name, city=city)
                        except ValidationError:
                            self.count_duplicates(station_duplicates,
                                                  (name_en, station_name))
        except Exception as e:
            self.stderr.write('Exception raised :{}'.format(repr(e)))
            raise
        finally:
            # Release the SSH connection even when the import fails.
            if client:
                client.close()

        # print result
        self.stdout.write(
            'Total cities scanned: {}. {} new city info is saved.'.format(
                total_city,
                total_city - self.calc_duplicates(city_duplicates)))
        self.stdout.write('Duplicate cities are: {}'.format(
            pprint.pformat(city_duplicates, indent=4)))
        self.stdout.write(
            'Total stations scanned: {}. {} new station info is saved.'.format(
                total_station,
                total_station - self.calc_duplicates(station_duplicates)))
        self.stdout.write('Duplicate stations are: {}'.format(
            pprint.pformat(station_duplicates, indent=4)))
Example #5
0
    def handle(self, *args, **options):
        """Import cities and their stations from HTML pages, then summarise.

        Files come from ``get_html_files_from_dir`` and are parsed with the
        ``extractors`` pipeline. All inserts happen inside one
        ``transaction.atomic()`` block. A ``ValidationError`` from
        ``validate_and_create`` is recorded via ``self.count_duplicates``
        rather than stopping the run, and a rejected city causes its
        stations to be skipped.

        Options read:
            pages_dir: directory holding the pages. With ``ssh`` set it may
                be written as ``[user@]host:path``.
            ssh: when truthy, the files are read through an SSH client.
            E: optional log file path handed to ``get_ssh_client``.
            port, password: forwarded to ``get_ssh_client``.
            city: forwarded to ``get_html_files_from_dir`` as ``city_names``.

        Raises:
            CommandError: when no hostname is given or the SSH connection
                cannot be established.
            Exception: anything else raised while importing is written to
                stderr and re-raised.
        """
        pages_dir = options["pages_dir"]
        client = None

        if options["ssh"]:
            # Use sftp to access file
            logging_file = options["E"]
            if logging_file:
                logging_file = os.path.expanduser(logging_file)

            port = options["port"]
            # get hostname
            username = ""
            if pages_dir.find(":") > 0:
                # Split on the first ":" only, so a remote path that itself
                # contains ":" does not break the unpacking.
                hostname, pages_dir = pages_dir.split(":", 1)
                if hostname.find("@") >= 0:
                    # The host is whatever follows the last "@".
                    username, hostname = hostname.rsplit("@", 1)
            else:
                hostname = input("Hostname: ")
            if not hostname:
                raise CommandError("*** Hostname required.")

            # get username
            if username == "":
                username = getpass.getuser()

            # get password
            password = options["password"]

            # connect
            try:
                client = get_ssh_client(hostname, username, password, port, log_file=logging_file)
            except ssh_exception.PasswordRequiredException:
                raise CommandError("Password is required.")
            except ssh_exception.AuthenticationException:
                raise CommandError("Authentication failed.")
            except ssh_exception.ServerNotKnown:
                raise CommandError("Unknown server.")
            except ssh_exception.ConnectionTimeOut:
                raise CommandError("Connection timed out.")

        station_duplicates = {}
        city_duplicates = {}
        total_city = 0
        total_station = 0

        try:
            with transaction.atomic():
                for file_name, content in get_html_files_from_dir(pages_dir, client, city_names=options["city"]):
                    total_city += 1
                    info_dict = extractors.process_parsed_dict(
                        extractors.parse_info_dict(extractors.extract_info(content))
                    )
                    name_en = re.search(r"([a-z]+)\.html", file_name).group(1)
                    name_cn = info_dict["city"]["area_cn"]
                    try:
                        city = City.objects.validate_and_create(name_en=name_en, name_cn=name_cn)
                    except ValidationError:
                        # A rejected city is counted as a duplicate and its
                        # stations are not processed.
                        self.count_duplicates(city_duplicates, name_en)
                        continue

                    for station_name in info_dict["stations"]:
                        total_station += 1
                        try:
                            Station.objects.validate_and_create(name_cn=station_name, city=city)
                        except ValidationError:
                            self.count_duplicates(station_duplicates, (name_en, station_name))
        except Exception as e:
            self.stderr.write("Exception raised :{}".format(repr(e)))
            raise
        finally:
            # Release the SSH connection even when the import fails.
            if client:
                client.close()

        # print result
        self.stdout.write(
            "Total cities scanned: {}. {} new city info is saved.".format(
                total_city, total_city - self.calc_duplicates(city_duplicates)
            )
        )
        self.stdout.write("Duplicate cities are: {}".format(pprint.pformat(city_duplicates, indent=4)))
        self.stdout.write(
            "Total stations scanned: {}. {} new station info is saved.".format(
                total_station, total_station - self.calc_duplicates(station_duplicates)
            )
        )
        self.stdout.write("Duplicate stations are: {}".format(pprint.pformat(station_duplicates, indent=4)))