def __init__(self, id, buffer):
    self.buffer = buffer
    self.html = fromstring(buffer)
    self.id = id
    self.alternative_title = None
    self.countries = list()
    self.countries_to_save = list()
    self.slogan = None
    self.persons = list()
    self.length = None
    self.year = None
    self.cast = list()
    self.ratings = list()
    self.genres = list()
    self.rating_kinopoisk = None
    self.rating_imdb = None
    self.rating_critics = None
    self.age_restriction = None
    self.premieres = list()
    self.world_premiere = None
    self.dates = list()
    self.boxes = list()
    self.rating_mpaa = None
    self.production_status = None
    self.full_id = self.get_full_id()
    logger.info('Full ID = %s' % self.full_id)
    self.cache = FileCache(namespace='kinopoisk',
                           path=os.environ.get('CACHE_PATH'))
    self.net = NetworkManager()
    self.parse()
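# Illustrative usage sketch, not part of the original source. Film parses
# eagerly: constructing it runs self.parse() and fills the fields above.
# Assumes a parser instance `app` exposing get_page()/get_film_url(), as the
# kinopoisk App class below does:
#
#   page = app.get_page(app.get_film_url(film_id))
#   film = Film(film_id, page)   # triggers self.parse()
#   film.save()                  # as called from App.get_year() below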
def __init__(self):
    self.cache = FileCache(namespace='russian-cities',
                           path=os.environ.get('CACHE_PATH'))
    self.net = NetworkManager()
class App(BasicParser):
    url_template = 'http://varlamov.ru/%(year)s/%(month)02d'

    def is_captcha_required(self, data):
        return False

    def __init__(self):
        logging.basicConfig(
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            level=logging.INFO,
            stream=sys.stdout)
        logger.setLevel(logging.INFO)
        logging.getLogger('cache').setLevel(logging.INFO)
        super(App, self).__init__()
        self.parser.add_argument('--post', type=str, help='Post to parse')
        self.parser.add_argument('--from-year', type=int,
                                 help='Year to start parse from',
                                 default=2006)
        self.parser.add_argument('--from-month', type=int,
                                 help='Month to start parse from', default=1)
        self.parser.add_argument(
            '--update', action='store_true', default=False,
            help='Do not use cache to construct post list')
        self.parser.add_argument('--db', type=str, help='Database DSN',
                                 default=db)
        self.parser.add_argument('--image', type=str,
                                 help='Process one image and exit')
        self.args = self.parser.parse_args()
        self.net = NetworkManager()
        self.cache = FileCache(path=cache_path, namespace='varlamov.ru')
        self.conn = psycopg2.connect(self.args.db)
        self.conn.set_isolation_level(
            psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)

    def get_sleep_time(self):
        return 0

    def fix_url(self, url):
        # Repair malformed URLs that occur in old posts
        if url.startswith('http:/varlamov'):
            return url.replace('http:/varlamov', 'http://varlamov')
        if url.startswith('http:////'):
            return url.replace('http:////', 'http://')
        if url.startswith('http:///'):
            return url.replace('http:///', 'http://')
        return url

    def is_date_valid(self, date):
        accepted_formats = [
            '%Y-%m-%d %H:%M:%S',
            '%Y:%m:%d %H:%M:%S',
            '%Y-%m-%dT%H:%M:%S',
            '%Y:%m:%dT%H:%M:%S',
            '%Y-%m-%dT%H:%M:%SZ',
            '%Y:%m:%dT%H:%M:%SZ',
            '%Y-%m-%dT%H:%MZ'
        ]
        for try_format in accepted_formats:
            try:
                datetime.datetime.strptime(date, try_format)
                logger.info('Date %s is valid', date)
                return True
            except ValueError:
                continue
        logger.info('Date %s is NOT valid', date)
        return False

    def process_image(self, post_id, url):
        def extract_tag(tags, tag_name):
            if tags.get(tag_name) is None or \
                    str(tags.get(tag_name)).strip() == '':
                return None
            return str(tags.get(tag_name))

        url = self.fix_url(url)
        if post_id is not None:
            image_id = self.get_image_id(post_id, url)
            # If the image is already present in the database, consider it
            # processed; this speeds things up when the script is restarted
            # after an unexpected stop
            if image_id is not None:
                return True
        try:
            data = self.get_page(url, binary=True)
        except (InternalServerError, InvalidSchema, UnicodeError):
            return False
        if data is None:
            return False
        fp = open(self.cache.get_cached_filename(url), 'rb')
        try:
            image = Image.open(fp)
        except IOError:
            logger.error('Could not read image %s' % url)
            return False
        logger.info('Image size: %s x %s' % image.size)
        fp.seek(0)
        try:
            tags = exifread.process_file(fp)
        except (UnicodeEncodeError, TypeError) as e:
            logger.error('Could not extract EXIF tags: %s' % str(e))
            tags = {}
        image_object = {
            'post_id': post_id,
            'url': url,
            'width': image.size[0],
            'height': image.size[1],
            'file_size': self.cache.get_file_size(url),
            'exif_camera_model': extract_tag(tags, 'Image Model'),
            'exif_focal_length': extract_tag(tags, 'EXIF FocalLength'),
            'exif_exposure_time': extract_tag(tags, 'EXIF ExposureTime'),
            'exif_date_time': extract_tag(tags, 'EXIF DateTimeOriginal'),
            'exif_aperture_value': extract_tag(tags, 'EXIF FNumber'),
            'exif_iso': extract_tag(tags, 'EXIF ISOSpeedRatings')
        }
        logger.info('Image: %s' % image_object)
        if image_object['exif_date_time'] is not None and \
                not self.is_date_valid(image_object['exif_date_time']):
            image_object['exif_date_time'] = None
        # --image <image_url>: process a single image and exit without saving
        if post_id is None:
            return
        self.save_image(image_object)

    def get_date(self, date_as_string):
        logger.info('get_date(%s)', date_as_string)
        month_mapping = {
            u'января': 1, u'февраля': 2, u'марта': 3, u'апреля': 4,
            u'мая': 5, u'июня': 6, u'июля': 7, u'августа': 8,
            u'сентября': 9, u'октября': 10, u'ноября': 11, u'декабря': 12
        }
        if date_as_string is None:
            return None
        if re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z',
                    date_as_string) and \
                self.is_date_valid(date_as_string):
            return date_as_string
        if ',' in date_as_string:
            # Try to interpret the value as a Russian-language date
            date_parts = date_as_string.split(',')
            day_parts = date_parts[0].split(' ')
            if day_parts[1] not in month_mapping:
                return None
            date_as_string = '%s-%02d-%02dT%sZ' % (
                day_parts[2], month_mapping[day_parts[1]],
                int(day_parts[0]), date_parts[1].strip())
            if self.is_date_valid(date_as_string):
                return date_as_string
            else:
                return None
        else:
            return None

    def extract_tags(self, html):
        result = list()
        for meta in html.xpath('//meta[@property="article:tag"]'):
            result.append(meta.get('content'))
        return result

    def process_post(self, post):
        logger.info('Processing post at URL %s', post)
        page = self.get_page(post['url'])
        if page is None:
            return False
        html = fromstring(page)
        content = html.xpath('//div[@id="entrytext"]')
        if len(content) < 1:
            return False
        title = html.xpath('//meta[@property="og:title"]')
        if len(title) > 0:
            post['title'] = title[0].get('content')
        else:
            raise Exception('Could not find title')
        date_published = html.xpath('//time[@itemprop="datePublished"]')
        if len(date_published) > 0:
            post['date_published'] = date_published[0].text_content()
        else:
            date_published = html.xpath('//time[@itemprop="dateCreated"]')
            if len(date_published) > 0:
                post['date_published'] = date_published[0].text_content()
            else:
                post['date_published'] = None
        date_modified = html.xpath('//time[@itemprop="dateModified"]')
        if len(date_modified) > 0:
            post['date_modified'] = date_modified[0].text_content()
        else:
            post['date_modified'] = None
        post['date_modified'] = self.get_date(post['date_modified'])
        post['date_published'] = self.get_date(post['date_published'])
        post['tags'] = self.extract_tags(html)
        logger.info('Tags: %s' % post['tags'])
        post_id = self.save_post(post)
        for img in content[0].xpath('.//img'):
            url = img.get('src')
            if url is None or url == '' or \
                    url.endswith('.ico') or \
                    url.endswith('.svg') or \
                    url.endswith('.gif'):
                continue
            if url.startswith('//'):
                url = 'http:%s' % url
            logger.info(url)
            self.process_image(post_id, url)

    def save_post(self, post):
        cursor = self.conn.cursor()
        query_check = '''
            select id from public.post where url = %(url)s
        '''
        query_insert = '''
            insert into public.post (url, title, date_published,
                                     date_modified, tags)
            values (%(url)s, %(title)s, %(date_published)s,
                    %(date_modified)s, %(tags)s)
            returning id
        '''
        query_update = '''
            update post
               set title = %(title)s,
                   date_modified = %(date_modified)s,
                   date_published = %(date_published)s,
                   tags = %(tags)s
             where id = %(id)s
        '''
        cursor.execute(query_check, post)
        result = cursor.fetchone()
        if result is None:
            cursor.execute(query_insert, post)
            result = cursor.fetchone()
            cursor.close()
            return result[0]
        else:
            logger.info('Post exists, id = %s, updating...'
                        % result[0])
            post.update({'id': result[0]})
            cursor.execute(query_update, post)
            cursor.close()
            return result[0]

    def get_image_id(self, post_id, url):
        query_check = '''
            select id from public.image
             where post_id = %(post_id)s and url = %(url)s
        '''
        cursor = self.conn.cursor()
        cursor.execute(query_check, locals())
        result = cursor.fetchone()
        if result is None:
            cursor.close()
            return None
        else:
            cursor.close()
            return result[0]

    def save_image(self, image):
        cursor = self.conn.cursor()
        query_insert = '''
            insert into public.image (post_id, url, width, height, file_size,
                                      exif_camera_model, exif_focal_length,
                                      exif_exposure_time, exif_date_time,
                                      exif_aperture_value, exif_iso)
            values (%(post_id)s, %(url)s, %(width)s, %(height)s,
                    %(file_size)s, %(exif_camera_model)s,
                    %(exif_focal_length)s, %(exif_exposure_time)s,
                    to_timestamp(%(exif_date_time)s, 'yyyy:mm:dd HH24:mi:ss'),
                    %(exif_aperture_value)s, %(exif_iso)s)
            returning id
        '''
        image_id = self.get_image_id(image['post_id'], image['url'])
        if image_id is None:
            cursor.execute(query_insert, image)
            result = cursor.fetchone()
            cursor.close()
            return result[0]
        else:
            # The image is already stored; return its existing id
            cursor.close()
            return image_id

    def extract_posts_from_range(self):
        posts_count = 0
        for year in range(self.args.from_year,
                          datetime.datetime.today().year + 1):
            for month in range(self.args.from_month, 13):
                logger.info('%s/%s' % (month, year))
                page = self.get_page(self.url_template % locals())
                if page is None:
                    continue
                html = fromstring(page)
                for a_item in html.xpath('//a[@class="j-day-subject-link"]'):
                    posts_count += 1
                    url = a_item.get('href')
                    logger.info('PROCESSING POST %s' % posts_count)
                    logger.info('%s / %s' % (url, a_item.text_content()))
                    try:
                        self.process_post({'url': url})
                    except (PageDownloadException, PageNotFound):
                        logger.error('Network error, continue')
                    except Exception as e:
                        logger.error('Could not parse post: %s' % str(e))
                        raise

    def run(self, argv):
        if self.args.post is not None:
            self.process_post({'url': self.args.post})
        elif self.args.image is not None:
            self.process_image(None, self.args.image)
        else:
            self.extract_posts_from_range()
        self.conn.close()
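# A minimal entry-point sketch, assuming this module is run as a script (the
# original __main__ guard is not part of this extract); this App.run()
# expects the argv list:
if __name__ == '__main__':
    App().run(sys.argv)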
class App(BasicParser):
    base = 'https://www.kinopoisk.ru'
    total_count = None
    current_page = None
    total_pages = None

    def __init__(self):
        parser = argparse.ArgumentParser(description='kinopoisk.ru parser')
        parser.add_argument('--year', type=int, help='Year to process')
        parser.add_argument('--hostname', type=str, help='Hostname',
                            required=False, default=gethostname())
        parser.add_argument('--film-id', type=int, help='Film ID')
        parser.add_argument('--sleep-time', type=int,
                            help='Max sleep time between requests',
                            default=20)
        parser.add_argument('--total', required=False, default=False,
                            action='store_true')
        parser.add_argument('--read-only', required=False, default=False,
                            action='store_true')
        parser.add_argument('--update', required=False, default=False,
                            action='store_true')
        parser.add_argument('--start-page', required=False, default=1,
                            type=int)
        parser.add_argument('--persons', required=False, default=False,
                            action='store_true')
        parser.add_argument('--from-id', required=False, default=1, type=int)
        parser.add_argument('--to-id', required=False, default=None)
        self.args = parser.parse_args()
        self.cache = FileCache(namespace='kinopoisk',
                               path=os.environ.get('CACHE_PATH'))
        self.net = NetworkManager()
        # Initialization of database connection
        db.connect(config.dsn)
        if self.args.year is not None:
            self.set_year(self.args.year)

    def set_year(self, year):
        config.year = year

    def get_page_with_captcha(self, page_text):
        html = fromstring(page_text)
        # Get captcha image URL
        img = html.xpath('//div[@class="captcha__image"]//img')
        captcha_url = img[0].get('src')
        # Get captcha key
        input_captcha_key = html.xpath('//input[@class="form__key"]')
        captcha_key = input_captcha_key[0].get('value')
        # Get return path
        input_retpath = html.xpath('//input[@class="form__retpath"]')
        retpath = input_retpath[0].get('value')
        logger.info('Captcha URL = %s, key = %s' % (captcha_url, captcha_key))
        r = requests.get(captcha_url, stream=True)
        if r.status_code != 200:
            raise Exception('Could not download captcha image')
        captcha_filename = self.cache.get_cached_filename(captcha_url)
        with open(captcha_filename, 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
        solver = CaptchaSolver(captcha_filename)
        task_id = solver.CreateTask()
        time.sleep(10)
        solution = solver.GetTaskResult(task_id)
        if solution is None:
            raise GetPageError('Could not solve captcha')
        params = {'key': captcha_key, 'retpath': retpath, 'rep': solution}
        r = requests.get('https://www.kinopoisk.ru/checkcaptcha',
                         params=params)
        # /checkcaptcha example:
        # https://www.kinopoisk.ru/checkcaptcha?key=<key>&retpath=<retpath>&rep=%D0%BB%D1%8E%D0%BD%D0%B3%D1%81%D1%82%D0%B0%D0%B4
        if r.status_code == 200:
            logger.info('CAPTCHA SOLVED!!!')
        return r

    def is_captcha_required(self, data):
        if 'captchaimg' in data:
            raise Exception('Captcha')
        return 'captchaimg' in data

    def solve_captcha(self, data):
        self.get_page_with_captcha(data)

    def get_rating_history(self, film_id):
        """
        /graph_data/variation_data_film/243/variation_data_film_810243.xml?
        + Math.random(0,10000)
        """
        return

    def get_pages_count(self, year, force_download=False):
        logger.info('Getting pages count for year %s' % year)
        page = self.get_page(self.get_url_for_year(year))
        html = fromstring(page)
        a = html.xpath(
            '//div[@class="paginator"]'
            '//a[@class="paginator__page-number"][last()]')
        if a is None or len(a) == 0:
            pages_count = 1
        else:
            pages_count = int(a[0].text_content())
        logger.info('Pages count = %s', pages_count)
        div = html.xpath('//div[@class="selections-seo-page__meta-info"]')
        if div is not None and len(div) > 0:
            self.total_count = int(re.sub(r'[^\d]', '',
                                          div[0].text_content()))
        else:
            raise Exception('Could not get total records count!')
        logger.info('Got total_count = %s' % self.total_count)
        self.total_pages = pages_count
        return pages_count

    def get_url_for_year(self, year, page=1):
        return '%s/lists/navigator/%s/?page=%s' % (self.base, year, page)

    def extract_id_from_url(self, url):
        if re.match(r'^/film/(\d+)/$', url):
            # Old URL format: /film/1049041/
            m = re.search(r'^/film/(\d+)/$', url)
            return int(m.group(1))
        else:
            # New URL format: /film/pyewacket-2017-1004054/
            m = re.search(r'-(\d+)/$', url)
            return int(m.group(1))

    def get_films_from_page(self, url, force_download=False):
        page = self.get_page(url)
        html = fromstring(page)
        for item in html.xpath(
                '//div[contains(@class, "selections-film-item")]'
                '//a[@class="selection-film-item-meta__link"]'):
            p = item.xpath('.//p[@class="selection-film-item-meta__name"]')
            title = p[0].text_content()
            href = item.get('href')
            id = self.extract_id_from_url(href)
            yield (id, title, href)

    def get_film_url(self, film_id):
        return '%s/film/%s/' % (self.base, film_id)

    def get_film(self, film_id):
        """
        Extracts all information about a film
        """
        page = self.get_page(self.get_film_url(film_id))
        film = Film(film_id, page)
        logger.info('%s (%s) | %s' % (film.title, film.alternative_title,
                                      film.year))
        return film

    def get_current_count(self):
        return db.query_value(
            'select count(*) from mdb.movie where year = %s' % config.year)

    def update_stat(self, last_movie_id):
        id = db.query_value('select id from mdb.stat where year = %s',
                            [config.year])
        if id is None:
            db.execute(
                'insert into mdb.stat (year, done_count, total_count, '
                'hostname, last_movie_id, current_page, total_pages) '
                'values (%s, %s, %s, %s, %s, %s, %s)', [
                    config.year,
                    self.get_current_count(), self.total_count,
                    self.args.hostname, last_movie_id, self.current_page,
                    self.total_pages
                ])
        else:
            db.execute(
                'update mdb.stat set done_count = %s, total_count = %s, '
                'hostname = %s, '
                'last_update_time = current_timestamp, last_movie_id = %s, '
                'current_page = %s, total_pages = %s '
                'where year = %s', [
                    self.get_current_count(), self.total_count,
                    self.args.hostname, last_movie_id, self.current_page,
                    self.total_pages, config.year
                ])

    def update_total(self):
        id = db.query_value('select id from mdb.stat where year = %s',
                            [config.year])
        if id is None:
            db.execute(
                'insert into mdb.stat (year, done_count, total_count, '
                'hostname, last_movie_id, total_pages) '
                'values (%s, %s, %s, %s, %s, %s)', [
                    config.year, 0, self.total_count, None, None,
                    self.total_pages
                ])

    def log_error(self, movie_id, message):
        """
        TODO: movie_id -> object_id
        """
        logger.error('Could not parse movie %s: "%s"' % (movie_id, message))
        db.execute(
            'insert into mdb.error(hostname, movie_id, message) '
            'values (%s, %s, %s)', [self.args.hostname, movie_id, message])

    def is_film_exists(self, movie_id):
        return db.query_value('select count(*) from mdb.movie where id = %s',
                              [movie_id]) > 0

    def get_year(self, year, update_mode=False):
        logger.info('======= Processing year %s =======' % year)
        for page_number in range(
                self.args.start_page,
                self.get_pages_count(year, force_download=update_mode) + 1):
            self.current_page = page_number
            logger.info('Processing page %s' % page_number)
            for id, title, href in self.get_films_from_page(
                    self.get_url_for_year(year, page_number),
                    force_download=update_mode):
                if update_mode:
                    # In update mode, skip films that are already stored
                    if self.is_film_exists(id):
                        continue
                    logger.warning('New film found')
                logger.info('%s | %s | %s' % (id, title, href))
                # try:
                f = self.get_film(id)
                if self.args.read_only is False:
                    f.save()
                # except Exception as e:
                #     self.log_error(id, str(e))
                logger.warning('%s from %s' % (self.get_current_count(),
                                               self.total_count))
                if self.args.read_only is False:
                    self.update_stat(id)
        # After all pages of a year have been fetched, reset the page counter
        # so that extraction of the next year always starts from page one
        self.args.start_page = 1

    def update_persons(self):
        query = "select id from mdb.person " \
                " where id between %s and coalesce(%s, 999999999) " \
                " and parsed_extra = false " \
                " order by id"
        for row in db.query_dict(query,
                                 [self.args.from_id, self.args.to_id]):
            logger.info('Parsing person with ID = %s', row['id'])
            try:
                # A separate name keeps the row dict intact, so its id is
                # still readable in the except-block below
                person = Person(row['id'])
                person.save()
            except Exception as e:
                logger.error('Could not process person %s' % row['id'])
                self.log_error(row['id'],
                               'Could not process person: %s' % str(e))

    def run(self):
        if self.args.persons is True:
            self.update_persons()
            return
        if self.args.total is True:
            logger.warning('======= Updating total stat =======')
            for year in range(1890, date.today().year + 1):
                logger.warning('Year %s' % year)
                config.year = year
                self.get_pages_count(year)
                self.update_total()
            return
        elif self.args.update is True:
            logger.warning('Running in UPDATE mode')
        elif self.args.film_id is not None:
            logger.warning('======= Processing film %s =======' %
                           self.args.film_id)
            f = self.get_film(self.args.film_id)
            f.save()
            sys.exit(0)
        while config.year <= date.today().year + 1:
            self.get_year(config.year, update_mode=self.args.update)
            self.set_year(config.year + 1)
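# A minimal entry-point sketch, assuming the usual __main__ guard (not shown
# in this extract); this App.run() takes no arguments:
if __name__ == '__main__':
    App().run()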
class App(BasicParser):
    data = list()

    def __init__(self):
        parser = argparse.ArgumentParser(
            description='Parse database export script')
        parser.add_argument('--url', help='Process only this URL', type=str,
                            required=False)
        self.args = parser.parse_args()
        self.cache = FileCache(namespace='germany-cities',
                               path=os.environ.get('CACHE_PATH'))
        self.net = NetworkManager()

    def get_url(self, url):
        if url.startswith('https://'):
            return url
        else:
            return '%s%s' % (URL_ROOT, url)

    def get_city_info(self, url):
        def get_td(th):
            td = th.xpath('./following-sibling::td[1]')
            return td[0].text_content().strip()

        def get_population(th):
            td = th.getparent().xpath('./following-sibling::tr//td[1]')
            return td[0].text_content().replace(',', '')

        def get_area(th):
            td = th.getparent().xpath('./following-sibling::tr//td[1]')
            return td[0].text_content().split('\xa0km')[0]

        info = {}
        page = self.get_page(self.get_url(url))
        if self.args.url:
            # In single-URL mode, print the cache file location for debugging
            print(self.cache.get_cached_filename(url), file=sys.stderr)
        html = fromstring(page)
        th1 = html.xpath(
            './/table[contains(@class, "geography")]//tbody//tr//th[1]'
            '//div[@style="display:inline"]')
        info['name'] = th1[0].text_content().strip()
        geo = html.xpath('.//span[@class="geo"]')
        if geo is not None and len(geo) > 0:
            info['coords'] = {
                'lat': geo[0].text_content().split('; ')[0],
                'lon': geo[0].text_content().split('; ')[1]
            }
        for th in html.xpath(
                './/table[contains(@class, "geography")][1]//tr//th'):
            title = th.text_content().strip()
            if title == 'District':
                info['district'] = get_td(th)
            elif title.startswith('Population'):
                info['population'] = get_population(th)
            elif (title == 'Area' or title == 'Area[1]') and \
                    'area' not in info:
                # We need to consider only the first occurrence of 'Area';
                # FIXME: the check for 'Area[1]' is only needed for Berlin
                info['area'] = get_area(th)
        return info

    def get_state(self, li):
        # The state name is given in parentheses at the end of the list item
        return re.search(r'\(([^()]+)\)$', li.text_content())[1]

    def run(self):
        if self.args.url is not None:
            info = self.get_city_info(self.args.url)
            print(json.dumps(info))
            sys.exit(0)
        page = self.get_page(
            'https://en.m.wikipedia.org/wiki/List_of_cities_and_towns_in_Germany'
        )
        html = fromstring(page)
        for a1 in html.xpath('.//table//tbody//tr//ul//li//a[1]'):
            # Skip pages that don't exist yet
            if a1.get('class') == 'new':
                continue
            info = self.get_city_info(a1.get('href'))
            # Some city pages don't mention the state, so we have to parse
            # the main list page to get it
            info['state'] = self.get_state(a1.getparent())
            if info is not None:
                print(info['name'], file=sys.stderr)
                self.data.append(info)
            else:
                print("Couldn't get info", file=sys.stderr)
        # Different states can contain cities with exactly the same name,
        # so sorting the list by the city name alone is not appropriate
        output = sorted(self.data,
                        key=lambda k: '%s|%s' % (k['name'], k['state']))
        print(json.dumps(output, ensure_ascii=False, sort_keys=True))
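# A minimal entry-point sketch, assuming the usual __main__ guard (not shown
# in this extract):
if __name__ == '__main__':
    App().run()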