def commit(self):
    something_changed = False
    if not self.image.exists:
        self.image.commit()
    # MAPPER is iterated as pairs of (db_instance attribute name, local attribute name).
    for k, v in self.MAPPER:
        k_value, v_value = getattr(self.db_instance, k), getattr(self, v)
        if str(k_value) != str(v_value):
            setattr(self.db_instance, k, getattr(self, v))
            something_changed = True
            logger.info(f'{k} changed from {k_value} to {v_value}')
    if something_changed:
        self.session.add(self.db_instance)
        self.session.commit()
    else:
        print(f'Nothing was changed for {self.db_instance.id}')
async def get_data(url):
    if is_redirect(url):
        logger.warning(f'URL is a redirect: {url}')
        raise IsRedirectError
    cache_dict = json.load(open(CACHE_DICT_PATH, 'r'))
    file_name = cache_dict.get(url, None)
    if not file_name:
        file_name = '{}.html'.format(str(uuid4())[:8])
        contents = await cache_html(url, file_name)
        cache_dict[url] = file_name
        json.dump(cache_dict, open(CACHE_DICT_PATH, 'w'))
    else:
        try:
            contents = open(f'{CACHED_FOLDER}/{file_name}', 'r').read()
            logger.info(f'Using cached: {file_name} for url: {url}')
        except FileNotFoundError:
            # Stale entry: mapping exists but the cached file is gone, so drop it and retry.
            del cache_dict[url]
            json.dump(cache_dict, open(CACHE_DICT_PATH, 'w'))
            return await get_data(url)
    return contents
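# is_redirect and IsRedirectError are referenced above but not shown in this snippet.
# A minimal sketch, assuming is_redirect issues a blocking HEAD request with requests
# and treats any well-formed 3xx answer as a redirect; the exact check is an assumption.
import requests


class IsRedirectError(Exception):
    """Raised when a URL points to a redirect instead of a content page."""


def is_redirect(url):
    response = requests.head(url, headers=HEADERS(), allow_redirects=False)
    return response.is_redirect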
def cache_html(url, name, attempts=1):
    # proxies = {
    #     'http': 'socks5://127.0.0.1:9050',
    # }
    if attempts > MAX_GET_ATTEMPTS:
        logger.critical(f'Tried {MAX_GET_ATTEMPTS} times to get URL {url}')
        raise TimeoutError(f'Tried {MAX_GET_ATTEMPTS} times to get URL {url}')
    logger.info(f'GET: {url}')
    if attempts > 1:
        logger.info(f'attempt: {attempts}')
    site = requests.get(url, headers=HEADERS())
    site.encoding = 'utf-8'
    if is_captcha(site.content):
        logger.warning(f'Captcha received for url: {url}')
        logger.warning(f'sleeping for {TIMEOUT_SEC * attempts}s...')
        sleep(TIMEOUT_SEC * attempts)
        return cache_html(url, name, attempts=attempts + 1)
    try:
        with open(Path(CACHED_FOLDER, name), 'wb') as out:
            out.write(site.content)
    except FileNotFoundError:
        import os
        os.mkdir(CACHED_FOLDER)
        with open(Path(CACHED_FOLDER, name), 'wb') as out:
            out.write(site.content)
    logger.info(f'Cache name: {name}')
    return site.content
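# is_captcha is called by both cache_html variants but not shown in this snippet.
# A minimal sketch, assuming the target site marks captcha pages with a recognizable
# phrase in the HTML; the marker string below is an assumption, not the site's real wording.
def is_captcha(content):
    if isinstance(content, bytes):
        content = content.decode('utf-8', errors='ignore')
    return 'captcha' in content.lower()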
def get_data(url, recreate_cache_forced=False, **kwargs):
    cache_dict = get_cache_dict()
    file_name = cache_dict.get(url, None)
    if not file_name:
        # file_name = '{}.html'.format(str(uuid4())[:8])
        file_name = get_unique_file_name(CACHED_FOLDER, 'html')
        contents = cache_html(url, file_name)
        cache_dict[url] = file_name
        json.dump(cache_dict, open(CACHE_DICT_PATH, 'w'))
    else:
        file_path = Path(CACHED_FOLDER, file_name)
        try:
            if recreate_cache_forced:
                file_path.unlink()
                logger.info(f'File: {file_path} removed')
                raise FileNotFoundError
            contents = open(file_path, 'r').read()
            logger.info(f'Using cached: {file_name} for url: {url}')
        except FileNotFoundError:
            del cache_dict[url]
            json.dump(cache_dict, open(CACHE_DICT_PATH, 'w'))
            return get_data(url)
    return contents
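# get_unique_file_name is referenced above but not shown in this snippet. A minimal
# sketch, assuming it only has to return a short random name that does not collide
# with an existing file in the folder (mirroring the commented-out uuid4 approach).
from pathlib import Path
from uuid import uuid4


def get_unique_file_name(folder, extension):
    while True:
        name = f'{str(uuid4())[:8]}.{extension}'
        if not Path(folder, name).exists():
            return name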
async def cache_html(url, name, attempts=1):
    if attempts > MAX_GET_ATTEMPTS:
        logger.critical(f'Tried {MAX_GET_ATTEMPTS} times to get URL {url}')
        raise TimeoutError(f'Tried {MAX_GET_ATTEMPTS} times to get URL {url}')
    logger.info(f'GET: {url}')
    if attempts > 1:
        logger.info(f'attempt: {attempts}')
    async with aiohttp.ClientSession() as session:
        site = await fetch(session, url)
    if is_captcha(site):
        logger.warning(f'Captcha received for url: {url}')
        logger.warning(f'Sleeping for {TIMEOUT_SEC * attempts}s...')
        await asyncio.sleep(TIMEOUT_SEC * attempts)
        return await cache_html(url, name, attempts=attempts + 1)
    with open(f'{CACHED_FOLDER}/{name}', 'w') as out:
        out.write(site)
    logger.info(f'Cache name: {name}')
    return site
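# fetch is awaited above but not shown in this snippet. A minimal sketch using the
# standard aiohttp request pattern; passing HEADERS() here is an assumption carried
# over from the synchronous cache_html.
async def fetch(session, url):
    async with session.get(url, headers=HEADERS()) as response:
        response.raise_for_status()
        return await response.text()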
        download_item(i, category_name)


def download_list_all(url_object):
    category_name, url = url_object.get('category_name'), url_object['url']
    soup = get_soup(url)
    pages_count = get_pages_count(soup)
    logger.debug(f'Found {pages_count} pages for list url {url}')
    logger.debug('download_list::getting data from page 1')
    download_page(soup, category_name)
    for page_num in range(2, pages_count + 1):
        logger.debug(f'download_list::getting data from page {page_num}')
        new_url = '{}&{}'.format(url, PAGE_PARAM.format(page_num))
        soup = get_soup(new_url)
        download_page(soup, category_name)


if __name__ == '__main__':
    link_set = get_link_set()
    # link_set = json.load(open(LINK_SET_PATH, 'r', encoding='utf8'))
    logger.info('START\n')
    # random.shuffle(link_set)
    DOWNLOAD_IMAGES = input('DOWNLOAD_IMAGES? (Y/N)\n').lower() == 'y'
    RESOLVE_OTHER_SHOP_URL = input('RESOLVE_OTHER_SHOP_URL? (Y/N)\n').lower() == 'y'
    while link_set:
        url_object_from_json = link_set.pop()
        logger.debug(f'url_object_from_json {str(url_object_from_json)}')
        download_list_all(url_object_from_json)
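# get_soup is used by download_list_all but not shown in this snippet. A minimal
# sketch, assuming it simply wraps the cached page markup in BeautifulSoup; the
# parser choice is an assumption.
from bs4 import BeautifulSoup


def get_soup(url):
    return BeautifulSoup(get_data(url), 'html.parser')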