Example #1
    def __init__(self):
        parser = argparse.ArgumentParser(
            description='Parse database export script')
        parser.add_argument('--url',
                            help='Process only this URL',
                            type=str,
                            required=False)
        self.args = parser.parse_args()

        self.cache = FileCache(namespace='germany-cities',
                               path=os.environ.get('CACHE_PATH'))
        self.net = NetworkManager()
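FileCache and NetworkManager are helpers from the surrounding project and are not shown in these examples. A minimal sketch of what a compatible FileCache could look like, inferred only from the calls made in these examples (the constructor plus get_cached_filename() and get_file_size()); the real implementation may well differ:

import hashlib
import os


class FileCache:
    """Hypothetical stand-in for the project's FileCache (assumption)."""

    def __init__(self, namespace, path=None):
        # One sub-directory per namespace under the configured cache path
        self.directory = os.path.join(path or '/tmp/cache', namespace)
        os.makedirs(self.directory, exist_ok=True)

    def get_cached_filename(self, url):
        # Map each URL to a stable file name via a hash of the URL
        digest = hashlib.sha256(url.encode('utf-8')).hexdigest()
        return os.path.join(self.directory, digest)

    def get_file_size(self, url):
        return os.path.getsize(self.get_cached_filename(url))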
Example #2
    def __init__(self, id, buffer):
        self.buffer = buffer
        self.html = fromstring(buffer)
        self.id = id
        self.alternative_title = None
        self.countries = list()
        self.countries_to_save = list()
        self.slogan = None
        self.persons = list()
        self.length = None
        self.year = None
        self.cast = list()
        self.ratings = list()
        self.genres = list()
        self.rating_kinopoisk = None
        self.rating_imdb = None
        self.rating_critics = None
        self.age_restriction = None
        self.premieres = list()
        self.world_premiere = None
        self.dates = list()
        self.boxes = list()
        self.rating_mpaa = None
        self.production_status = None
        self.full_id = self.get_full_id()
        logger.info('Full ID = %s' % self.full_id)

        self.cache = FileCache(namespace='kinopoisk', path=os.environ.get('CACHE_PATH'))
        self.net = NetworkManager()

        self.parse()
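Because the constructor ends with self.parse(), creating an instance is enough to populate every field. A hypothetical usage sketch, assuming the class is named Film and that page_html was fetched elsewhere (get_full_id() and parse() are defined elsewhere in the class):

# film = Film(810243, page_html)
# print(film.full_id, film.year, film.rating_imdb)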
Example #3
    def __init__(self):
        parser = argparse.ArgumentParser(description='kinopoisk.ru parser')
        parser.add_argument('--year', type=int, help='Year to process')
        parser.add_argument('--hostname',
                            type=str,
                            help='Hostname',
                            required=False,
                            default=gethostname())
        parser.add_argument('--film-id', type=int, help='Film ID')
        parser.add_argument('--sleep-time',
                            type=int,
                            help='Max sleep time between requests',
                            default=20)
        parser.add_argument('--total',
                            required=False,
                            default=False,
                            action='store_true')
        parser.add_argument('--read-only',
                            required=False,
                            default=False,
                            action='store_true')
        parser.add_argument('--update',
                            required=False,
                            default=False,
                            action='store_true')
        parser.add_argument('--start-page',
                            required=False,
                            default=1,
                            type=int)
        parser.add_argument('--persons',
                            required=False,
                            default=False,
                            action='store_true')
        parser.add_argument('--from-id', required=False, default=1, type=int)
        parser.add_argument('--to-id', required=False, default=None)
        self.args = parser.parse_args()

        self.cache = FileCache(namespace='kinopoisk',
                               path=os.environ.get('CACHE_PATH'))
        self.net = NetworkManager()
        # Initialization of database connection
        db.connect(config.dsn)

        if self.args.year is not None:
            self.set_year(self.args.year)
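Hypothetical invocations combining the flags defined above (the script name is an assumption):

python kinopoisk_parser.py --year 2005 --sleep-time 5 --read-only
python kinopoisk_parser.py --persons --from-id 1000 --to-id 2000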
Example #4
    def __init__(self):
        logging.basicConfig(
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            level=logging.INFO,
            stream=sys.stdout)
        logger.setLevel(logging.INFO)
        logging.getLogger('cache').setLevel(logging.INFO)

        super(App, self).__init__()

        self.parser.add_argument('--post', type=str, help='Post to parse')
        self.parser.add_argument('--from-year',
                                 type=int,
                                 help='Year to start parse from',
                                 default=2006)
        self.parser.add_argument('--from-month',
                                 type=int,
                                 help='Month to start parse from',
                                 default=1)
        self.parser.add_argument(
            '--update',
            action='store_true',
            default=False,
            help='Do not use cache to construct post list')
        self.parser.add_argument('--db',
                                 type=str,
                                 help='Database DSN',
                                 default=db)
        self.parser.add_argument('--image',
                                 type=str,
                                 help='Process one image and exit')
        self.args = self.parser.parse_args()

        self.net = NetworkManager()
        self.cache = FileCache(path=cache_path, namespace='varlamov.ru')

        self.conn = psycopg2.connect(self.args.db)
        self.conn.set_isolation_level(
            psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
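        # With ISOLATION_LEVEL_AUTOCOMMIT every execute() is committed
        # immediately, so the parser never needs an explicit conn.commit()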
Example #5
    def __init__(self):
        self.cache = FileCache(namespace='russian-cities',
                               path=os.environ.get('CACHE_PATH'))
        self.net = NetworkManager()
Example #6
class App(BasicParser):

    url_template = 'http://varlamov.ru/%(year)s/%(month)02d'

    def is_captcha_required(self, data):
        return False

    def __init__(self):
        logging.basicConfig(
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            level=logging.INFO,
            stream=sys.stdout)
        logger.setLevel(logging.INFO)
        logging.getLogger('cache').setLevel(logging.INFO)

        super(App, self).__init__()

        self.parser.add_argument('--post', type=str, help='Post to parse')
        self.parser.add_argument('--from-year',
                                 type=int,
                                 help='Year to start parse from',
                                 default=2006)
        self.parser.add_argument('--from-month',
                                 type=int,
                                 help='Month to start parse from',
                                 default=1)
        self.parser.add_argument(
            '--update',
            action='store_true',
            default=False,
            help='Do not use cache to construct post list')
        self.parser.add_argument('--db',
                                 type=str,
                                 help='Database DSN',
                                 default=db)
        self.parser.add_argument('--image',
                                 type=str,
                                 help='Process one image and exit')
        self.args = self.parser.parse_args()

        self.net = NetworkManager()
        self.cache = FileCache(path=cache_path, namespace='varlamov.ru')

        self.conn = psycopg2.connect(self.args.db)
        self.conn.set_isolation_level(
            psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)

    def get_sleep_time(self):
        return 0

    def fix_url(self, url):
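        # Repair malformed scheme prefixes that occasionally appear in posts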
        if url.startswith('http:/varlamov'):
            return url.replace('http:/varlamov', 'http://varlamov')
        if url.startswith('http:////'):
            return url.replace('http:////', 'http://')
        if url.startswith('http:///'):
            return url.replace('http:///', 'http://')
        return url

    def is_date_valid(self, date):
        accepted_formats = [
            '%Y-%m-%d %H:%M:%S', '%Y:%m:%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S',
            '%Y:%m:%dT%H:%M:%S', '%Y-%m-%dT%H:%M:%SZ', '%Y:%m:%dT%H:%M:%SZ',
            '%Y-%m-%dT%H:%MZ'
        ]
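        # The colon-separated variants ('%Y:%m:%d ...') match EXIF-style
        # timestamps; the dash-separated ones match ISO-8601-like values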
        for try_format in accepted_formats:
            try:
                d = datetime.datetime.strptime(date, try_format)
                logger.info('Date %s is valid', date)
                return True
            except ValueError:
                continue

        logger.info('Date %s is NOT valid', date)
        return False

    def process_image(self, post_id, url):
        def extract_tag(tags, tag_name):
            value = tags.get(tag_name)
            if value is None or str(value).strip() == '':
                return None
            return str(value)

        url = self.fix_url(url)

        if post_id is not None:
            image_id = self.get_image_id(post_id, url)

            # If information about this image is already in the database,
            # treat the image as processed; this speeds things up when
            # restarting after an unexpected stop
            if image_id is not None:
                return True

        try:
            data = self.get_page(url, binary=True)
        except (InternalServerError, InvalidSchema, UnicodeError):
            return False

        if data is None:
            return False

        fp = open(self.cache.get_cached_filename(url), 'rb')

        try:
            image = Image.open(fp)
        except IOError:
            logger.error('Could not read image %s' % url)
            return False

        logger.info('Image size: %s x %s' % (image.size))

        fp.seek(0)
        try:
            tags = exifread.process_file(fp)
        except (UnicodeEncodeError, TypeError) as e:
            logger.error('Could not extract EXIF tags: %s' % str(e))
            tags = {}

        image_object = {
            'post_id': post_id,
            'url': url,
            'width': image.size[0],
            'height': image.size[1],
            'file_size': self.cache.get_file_size(url),
            'exif_camera_model': extract_tag(tags, 'Image Model'),
            'exif_focal_length': extract_tag(tags, 'EXIF FocalLength'),
            'exif_exposure_time': extract_tag(tags, 'EXIF ExposureTime'),
            'exif_date_time': extract_tag(tags, 'EXIF DateTimeOriginal'),
            'exif_aperture_value': extract_tag(tags, 'EXIF FNumber'),
            'exif_iso': extract_tag(tags, 'EXIF ISOSpeedRatings')
        }
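        # Tag names such as 'EXIF FocalLength' follow exifread's
        # '<IFD name> <tag name>' naming convention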

        logger.info('Image: %s' % image_object)

        if image_object['exif_date_time'] is not None and \
            not self.is_date_valid(image_object['exif_date_time']):
            image_object['exif_date_time'] = None

        # Called with --image <image_url>: process a single image and exit
        if post_id is None:
            return

        self.save_image(image_object)

    def get_date(self, date_as_string):
        logger.info('get_date(%s)', date_as_string)
        month_mapping = {
            u'января': 1,
            u'февраля': 2,
            u'марта': 3,
            u'апреля': 4,
            u'мая': 5,
            u'июня': 6,
            u'июля': 7,
            u'августа': 8,
            u'сентября': 9,
            u'октября': 10,
            u'ноября': 11,
            u'декабря': 12
        }
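        # Month names are in the Russian genitive case, as they appear in
        # dates like '31 декабря 2017, 12:34:56'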

        if date_as_string is None:
            return None
        if re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z', date_as_string) and \
            self.is_date_valid(date_as_string):
            return date_as_string
        if ',' in date_as_string:
            # Try to interpret the value as a Russian-language date
            date_parts = date_as_string.split(',')

            day_parts = date_parts[0].split(' ')

            if day_parts[1] not in month_mapping.keys():
                return None

            date_as_string = '%s-%02d-%02dT%sZ' % (
                day_parts[2], month_mapping[day_parts[1]],
                int(day_parts[0]), date_parts[1].strip())
            if self.is_date_valid(date_as_string):
                return date_as_string
            else:
                return None
        else:
            return None

    def extract_tags(self, html):
        result = list()
        for meta in html.xpath('//meta[@property="article:tag"]'):
            result.append(meta.get('content'))
        return result

    def process_post(self, post):
        logger.info('Processing post %s', post)
        page = self.get_page(post['url'])

        if page is None:
            return False

        html = fromstring(page)

        content = html.xpath('//div[@id="entrytext"]')

        if len(content) < 1:
            return False

        title = html.xpath('//meta[@property="og:title"]')

        if len(title) > 0:
            post['title'] = title[0].get('content')
        else:
            raise Exception('Could not find title')

        date_published = html.xpath('//time[@itemprop="datePublished"]')

        if len(date_published) > 0:
            post['date_published'] = date_published[0].text_content()
        else:
            date_published = html.xpath('//time[@itemprop="dateCreated"]')
            if len(date_published) > 0:
                post['date_published'] = date_published[0].text_content()
            else:
                post['date_published'] = None

        date_modified = html.xpath('//time[@itemprop="dateModified"]')

        if len(date_modified) > 0:
            post['date_modified'] = date_modified[0].text_content()
        else:
            post['date_modified'] = None

        post['date_modified'] = self.get_date(post['date_modified'])
        post['date_published'] = self.get_date(post['date_published'])
        post['tags'] = self.extract_tags(html)

        logger.info('Tags: %s' % post['tags'])

        post_id = self.save_post(post)

        for img in content[0].xpath('.//img'):
            url = img.get('src')

            if url is None or url == '' or \
               url.endswith('.ico') or \
               url.endswith('.svg') or \
               url.endswith('.gif'):
                continue

            if url.startswith('//'):
                url = 'http:%s' % url

            logger.info(url)
            self.process_image(post_id, url)

    def save_post(self, post):
        cursor = self.conn.cursor()

        query_check = '''
        select id from public.post where url = %(url)s
        '''

        query_insert = '''
        insert into public.post (url, title, date_published, date_modified, tags)
        values (%(url)s, %(title)s, %(date_published)s, %(date_modified)s, %(tags)s)
        returning id
        '''

        query_update = '''
        update post
           set title = %(title)s,
               date_modified = %(date_modified)s,
               date_published = %(date_published)s,
               tags = %(tags)s
         where id = %(id)s
        '''

        cursor.execute(query_check, post)
        result = cursor.fetchone()

        if result is None:
            cursor.execute(query_insert, post)
            result = cursor.fetchone()
            cursor.close()
            return result[0]
        else:
            logger.info('Post exists, id = %s, updating...' % result[0])
            post.update({'id': result[0]})
            cursor.execute(query_update, post)
            cursor.close()
            return result[0]

    def get_image_id(self, post_id, url):
        query_check = '''
        select id from public.image
         where post_id = %(post_id)s
           and url = %(url)s
        '''
        cursor = self.conn.cursor()
        cursor.execute(query_check, locals())
        result = cursor.fetchone()

        if result is None:
            cursor.close()
            return None
        else:
            cursor.close()
            return result[0]

    def save_image(self, image):
        cursor = self.conn.cursor()

        query_insert = '''
        insert into public.image (post_id, url, width, height, file_size,
                                  exif_camera_model, exif_focal_length,
                                  exif_exposure_time, exif_date_time, exif_aperture_value, exif_iso)
        values (%(post_id)s, %(url)s, %(width)s, %(height)s, %(file_size)s,
                %(exif_camera_model)s, %(exif_focal_length)s, %(exif_exposure_time)s,
                to_timestamp(%(exif_date_time)s, 'yyyy:mm:dd HH24:mi:ss'),
                %(exif_aperture_value)s, %(exif_iso)s)
        returning id
        '''
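        # to_timestamp() parses the raw EXIF date string (e.g.
        # '2017:05:01 12:34:56') into a proper timestamp on the database side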

        image_id = self.get_image_id(image['post_id'], image['url'])

        if image_id is None:
            cursor.execute(query_insert, image)
            result = cursor.fetchone()
            cursor.close()
            return result[0]
        else:
            cursor.close()
            return image_id

    def extract_posts_from_range(self):
        posts_count = 0

        for year in range(self.args.from_year,
                          datetime.datetime.today().year + 1):
            for month in range(self.args.from_month, 13):
                logger.info('%s/%s' % (month, year))

                page = self.get_page(self.url_template % locals())
                if page is None:
                    continue

                html = fromstring(page)

                for a_item in html.xpath('//a[@class="j-day-subject-link"]'):
                    posts_count += 1
                    url = a_item.get('href')

                    logger.info('PROCESSING POST %s' % posts_count)
                    logger.info('%s / %s' % (url, a_item.text_content()))

                    try:
                        self.process_post({'url': url})
                    except (PageDownloadException, PageNotFound):
                        logger.error('Network error, continue')
                    except Exception as e:
                        logger.error('Could not parse post: %s' % str(e))
                        raise

    def run(self, argv):

        if self.args.post is not None:
            self.process_post({'url': self.args.post})
        elif self.args.image is not None:
            self.process_image(None, self.args.image)
        else:
            self.extract_posts_from_range()

        self.conn.close()
Example #7
class App(BasicParser):

    base = 'https://www.kinopoisk.ru'
    total_count = None
    current_page = None
    total_pages = None

    def __init__(self):
        parser = argparse.ArgumentParser(description='kinopoisk.ru parser')
        parser.add_argument('--year', type=int, help='Year to process')
        parser.add_argument('--hostname',
                            type=str,
                            help='Hostname',
                            required=False,
                            default=gethostname())
        parser.add_argument('--film-id', type=int, help='Film ID')
        parser.add_argument('--sleep-time',
                            type=int,
                            help='Max sleep time between requests',
                            default=20)
        parser.add_argument('--total',
                            required=False,
                            default=False,
                            action='store_true')
        parser.add_argument('--read-only',
                            required=False,
                            default=False,
                            action='store_true')
        parser.add_argument('--update',
                            required=False,
                            default=False,
                            action='store_true')
        parser.add_argument('--start-page',
                            required=False,
                            default=1,
                            type=int)
        parser.add_argument('--persons',
                            required=False,
                            default=False,
                            action='store_true')
        parser.add_argument('--from-id', required=False, default=1, type=int)
        parser.add_argument('--to-id', required=False, default=None)
        self.args = parser.parse_args()

        self.cache = FileCache(namespace='kinopoisk',
                               path=os.environ.get('CACHE_PATH'))
        self.net = NetworkManager()
        # Initialization of database connection
        db.connect(config.dsn)

        if self.args.year is not None:
            self.set_year(self.args.year)

    def set_year(self, year):
        config.year = year

    def get_page_with_captcha(self, page_text):
        html = fromstring(page_text)
        # Get captcha image URL
        img = html.xpath('//div[@class="captcha__image"]//img')
        captcha_url = img[0].get('src')
        # Get captcha key
        input_captcha_key = html.xpath('//input[@class="form__key"]')
        captcha_key = input_captcha_key[0].get('value')
        # Get return path
        input_retpath = html.xpath('//input[@class="form__retpath"]')
        retpath = input_retpath[0].get('value')

        logger.info('Captcha URL = %s, key = %s' % (captcha_url, captcha_key))

        r = requests.get(captcha_url, stream=True)
        if r.status_code != 200:
            raise Exception('Could not download captcha image')
        captcha_filename = self.cache.get_cached_filename(captcha_url)
        with open(captcha_filename, 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)

        solver = CaptchaSolver(captcha_filename)
        task_id = solver.CreateTask()
        time.sleep(10)
        solution = solver.GetTaskResult(task_id)
        if solution is None:
            raise GetPageError('Could not solve captcha')

        params = {'key': captcha_key, 'retpath': retpath, 'rep': solution}
        r = requests.get('https://www.kinopoisk.ru/checkcaptcha',
                         params=params)
        # /checkcaptcha example:
        # https://www.kinopoisk.ru/checkcaptcha?key=<key>&retpath=<retpath>&rep=%D0%BB%D1%8E%D0%BD%D0%B3%D1%81%D1%82%D0%B0%D0%B4
        if r.status_code == 200:
            logger.info('CAPTCHA SOLVED!!!')

        return r

    def is_captcha_required(self, data):
        if 'captchaimg' in data:
            raise Exception('Captcha')
        return False

    def solve_captcha(self, data):
        self.get_page_with_captcha(data)

    def get_rating_history(self, film_id):
        """
        /graph_data/variation_data_film/243/variation_data_film_810243.xml? + Math.random(0,10000)
        """
        return

    def get_pages_count(self, year, force_download=False):
        logger.info('Getting pages count for year %s' % year)
        page = self.get_page(self.get_url_for_year(year))
        html = fromstring(page)
        a = html.xpath(
            '//div[@class="paginator"]//a[@class="paginator__page-number"][last()]'
        )
        if not a:
            pages_count = 1
        else:
            pages_count = int(a[0].text_content())

        logger.info('Pages count = %s', pages_count)

        div = html.xpath('//div[@class="selections-seo-page__meta-info"]')
        if div is not None and len(div) > 0:
            self.total_count = int(re.sub(r'[^\d]', '', div[0].text_content()))
        else:
            raise Exception('Could not get total records count!')

        logger.info('Got total_count = %s' % self.total_count)
        self.total_pages = pages_count
        return pages_count

    def get_url_for_year(self, year, page=1):
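        # e.g. https://www.kinopoisk.ru/lists/navigator/2005/?page=3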
        return '%s/lists/navigator/%s/?page=%s' % (
            self.base,
            year,
            page,
        )

    def extract_id_from_url(self, url):
        if re.match(r'^/film/(\d+)/$', url):
            # Old URL format
            # /film/1049041/
            m = re.search(r'^/film/(\d+)/$', url)
            return int(m.group(1))
        else:
            # New URL format
            # /film/pyewacket-2017-1004054/
            m = re.search(r'-(\d+)/$', url)
            return int(m.group(1))

    def get_films_from_page(self, url, force_download=False):
        page = self.get_page(url)
        html = fromstring(page)
        for item in html.xpath(
                '//div[contains(@class, "selections-film-item")]//a[@class="selection-film-item-meta__link"]'
        ):
            p = item.xpath('.//p[@class="selection-film-item-meta__name"]')
            title = p[0].text_content()
            href = item.get('href')
            id = self.extract_id_from_url(href)
            yield (id, title, href)

    def get_film_url(self, film_id):
        return '%s/film/%s/' % (
            self.base,
            film_id,
        )

    def get_film(self, film_id):
        """
        Extracts all information about a film
        """
        page = self.get_page(self.get_film_url(film_id))
        film = Film(film_id, page)

        logger.info('%s (%s) | %s' % (
            film.title,
            film.alternative_title,
            film.year,
        ))
        return film

    def get_current_count(self):
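        # The year is interpolated directly into the SQL string here
        # (config.year is an int); the other queries use bound parameters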
        return db.query_value(
            'select count(*) from mdb.movie where year = %s' % config.year)

    def update_stat(self, last_movie_id):
        id = db.query_value('select id from mdb.stat where year = %s',
                            [config.year])
        if id is None:
            db.execute(
                'insert into mdb.stat (year, done_count, total_count, hostname, '
                'last_movie_id, current_page, total_pages) '
                'values (%s, %s, %s, %s, %s, %s, %s)', [
                    config.year,
                    self.get_current_count(), self.total_count,
                    self.args.hostname, last_movie_id, self.current_page,
                    self.total_pages
                ])
        else:
            db.execute(
                'update mdb.stat set done_count = %s, total_count = %s, hostname = %s, '
                'last_update_time = current_timestamp, last_movie_id = %s, '
                'current_page = %s, total_pages = %s '
                'where year = %s', [
                    self.get_current_count(), self.total_count,
                    self.args.hostname, last_movie_id, self.current_page,
                    self.total_pages, config.year
                ])

    def update_total(self):
        id = db.query_value('select id from mdb.stat where year = %s',
                            [config.year])
        if id is None:
            db.execute(
                'insert into mdb.stat (year, done_count, total_count, hostname, '
                'last_movie_id, total_pages) '
                'values (%s, %s, %s, %s, %s, %s)', [
                    config.year, 0, self.total_count, None, None,
                    self.total_pages
                ])

    def log_error(self, movie_id, message):
        """
        TODO: movie_id -> object_id
        """
        logger.error('Could not parse movie %s: "%s"' % (
            movie_id,
            message,
        ))
        db.execute(
            'insert into mdb.error(hostname, movie_id, message) '
            'values (%s, %s, %s)', [self.args.hostname, movie_id, message])

    def is_film_exists(self, movie_id):
        return db.query_value('select count(*) from mdb.movie where id = %s',
                              [movie_id]) > 0

    def get_year(self, year, update_mode=False):
        logger.info('======= Processing year %s =======' % year)
        for page_number in range(
                self.args.start_page,
                self.get_pages_count(year, force_download=update_mode) + 1):
            self.current_page = page_number
            logger.info("Processing page %s" % page_number)
            for id, title, href in self.get_films_from_page(
                    self.get_url_for_year(year, page_number),
                    force_download=update_mode):
                if update_mode:
                    if self.is_film_exists(id):
                        continue
                    logger.warning('New film found')
                    logger.warning('New film found')

                logger.info('%s | %s | %s' % (
                    id,
                    title,
                    href,
                ))

                #try:
                f = self.get_film(id)
                if self.args.read_only is False:
                    f.save()
                #except Exception as e:
                #    self.log_error(id, str(e))
                logger.warning('%s from %s' % (
                    self.get_current_count(),
                    self.total_count,
                ))
                if self.args.read_only is False:
                    self.update_stat(id)
        # After all pages of a year have been fetched, reset the page counter
        # so that the next year always starts from the first page
        self.args.start_page = 1

    def update_persons(self):
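        # coalesce(%s, 999999999) makes --to-id optional: when it is NULL,
        # the upper bound of the ID range is effectively open-ended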
        query = "select id from mdb.person " \
                " where id between %s and coalesce(%s, 999999999) " \
                "   and parsed_extra = false " \
                " order by id"
        for person in db.query_dict(query,
                                    [self.args.from_id, self.args.to_id]):
            person_id = person['id']
            logger.info('Parsing person with ID = %s', person_id)
            try:
                # Don't rebind the loop variable: the except block below
                # still needs the plain ID if Person() or save() fails
                Person(person_id).save()
            except Exception as e:
                logger.error('Could not process person %s' % person_id)
                self.log_error(person_id,
                               'Could not process person: %s' % str(e))

    def run(self):
        if self.args.persons is True:
            self.update_persons()
            return
        if self.args.total is True:
            logger.warning('======= Updating total stat =======')
            for year in range(1890, date.today().year + 1):
                logger.warning('Year %s' % year)
                config.year = year
                self.get_pages_count(year)
                self.update_total()
            return
        elif self.args.update is True:
            logger.warning('Running in UPDATE mode')
        elif self.args.film_id is not None:
            logger.warning('======= Processing film %s =======' %
                           self.args.film_id)
            f = self.get_film(self.args.film_id)
            f.save()
            sys.exit(0)

        while config.year <= date.today().year + 1:
            self.get_year(config.year, update_mode=self.args.update)
            self.set_year(config.year + 1)
Example #8
class App(BasicParser):

    data = list()

    def __init__(self):
        parser = argparse.ArgumentParser(
            description='Parse database export script')
        parser.add_argument('--url',
                            help='Process only this URL',
                            type=str,
                            required=False)
        self.args = parser.parse_args()

        self.cache = FileCache(namespace='germany-cities',
                               path=os.environ.get('CACHE_PATH'))
        self.net = NetworkManager()

    def get_url(self, url):
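        # URL_ROOT is defined elsewhere in this script; relative wiki links
        # are resolved against it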
        if url.startswith('https://'):
            return url
        else:
            return '%s%s' % (URL_ROOT, url)

    def get_city_info(self, url):
        def get_td(th):
            td = th.xpath('./following-sibling::td[1]')
            return td[0].text_content().strip()

        def get_population(th):
            td = th.getparent().xpath('./following-sibling::tr//td[1]')
            return td[0].text_content().replace(',', '')

        def get_area(th):
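            # '\xa0' is a non-breaking space, used between the number and
            # the 'km' unit in Wikipedia infoboxes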
            td = th.getparent().xpath('./following-sibling::tr//td[1]')
            return td[0].text_content().split('\xa0km')[0]

        info = {}

        page = self.get_page(self.get_url(url))

        if self.args.url:
            print(self.cache.get_cached_filename(url), file=sys.stderr)
        html = fromstring(page)

        th1 = html.xpath(
            './/table[contains(@class, "geography")]//tbody//tr//th[1]//div[@style="display:inline"]'
        )
        info['name'] = th1[0].text_content().strip()

        geo = html.xpath('.//span[@class="geo"]')
        if geo:  # xpath() returns a list; check for emptiness, not None
            info['coords'] = {
                'lat': geo[0].text_content().split('; ')[0],
                'lon': geo[0].text_content().split('; ')[1]
            }

        for th in html.xpath(
                './/table[contains(@class, "geography")][1]//tr//th'):
            title = th.text_content().strip()

            if title == 'District':
                info['district'] = get_td(th)
            elif title.startswith('Population'):
                info['population'] = get_population(th)
            elif (title == 'Area'
                  or title == 'Area[1]') and 'area' not in info:
                # We only consider the first occurrence of 'Area';
                # FIXME: the 'Area[1]' check is only needed for Berlin
                info['area'] = get_area(th)

        return info

    def get_state(self, li):
        return re.search(r'\(([^()]+)\)$', li.text_content())[1]

    def run(self):
        if self.args.url is not None:
            info = self.get_city_info(self.args.url)
            print(json.dumps(info))
            sys.exit(0)

        page = self.get_page(
            'https://en.m.wikipedia.org/wiki/List_of_cities_and_towns_in_Germany'
        )
        html = fromstring(page)

        for a1 in html.xpath('.//table//tbody//tr//ul//li//a[1]'):

            # Skip pages that don't exist yet
            if a1.get('class') == 'new':
                continue

            info = self.get_city_info(a1.get('href'))
            # Some city pages don't mention the state, so take it from the
            # main list page instead
            info['state'] = self.get_state(a1.getparent())

            print(info['name'], file=sys.stderr)
            self.data.append(info)

        # City names are not unique across Germany, so sort by the
        # combination of name and state rather than by name alone
        output = sorted(self.data,
                        key=lambda k: '%s|%s' % (k['name'], k['state']))
        print(json.dumps(output, ensure_ascii=False, sort_keys=True))
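Hypothetical invocations of this example (the script name is an assumption). Run without arguments it writes a JSON array of all cities to stdout; with --url it prints a single city's record:

python germany_cities.py > cities.json
python germany_cities.py --url /wiki/Munich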