def compare_by_title(self, title: str) -> bool:
    request_dict = construct_request_dict(self.settings, self.own_settings)
    request_dict['params'] = {'type': 'title', 'search_value': title}

    r = request_with_retries(
        urljoin(constants.main_url, 'Search'),
        request_dict,
    )

    if not r:
        logger.info("Got no response from server")
        return False

    r.encoding = 'utf-8'
    soup_1 = BeautifulSoup(r.text, 'html.parser')

    matches_links = set()

    for gallery in soup_1.find_all("div", class_=re.compile("showcase_comics_product_image_box")):
        link_container = gallery.find("a")
        if link_container:
            matches_links.add(urljoin(constants.main_url, link_container['href']))

    self.gallery_links = list(matches_links)

    if len(self.gallery_links) > 0:
        self.found_by = self.name
        return True
    else:
        return False
def compare_by_title(self, title: str) -> bool:
    request_dict = construct_request_dict(self.settings, self.own_settings)

    r = request_with_retries(
        urljoin(constants.main_url, 'search/') + quote(title),
        request_dict,
    )

    if not r:
        logger.info("Got no response from server")
        return False

    r.encoding = 'utf-8'
    soup_1 = BeautifulSoup(r.text, 'html.parser')

    matches_links = set()

    # content-row manga row
    for gallery in soup_1.find_all("div", class_=re.compile("content-row")):
        link_container = gallery.find("a", class_="content-title")
        if link_container:
            matches_links.add(urljoin(constants.main_url, link_container['href']))

    self.gallery_links = list(matches_links)

    if len(self.gallery_links) > 0:
        self.found_by = self.name
        return True
    else:
        return False
def crawl_feed(self, feed_url: str = '') -> list[str]:
    urls: list[str] = []

    if not feed_url:
        feed_url = constants.rss_url

    request_dict = construct_request_dict(self.settings, self.own_settings)

    response = request_with_retries(
        feed_url,
        request_dict,
        post=False,
    )

    if not response:
        logger.error("Got no response from feed URL: {}".format(feed_url))
        return urls

    response.encoding = 'utf-8'

    feed = feedparser.parse(response.text)

    for item in feed['items']:
        if any([item['title'].startswith(category) for category in self.own_settings.accepted_rss_categories]):
            urls.append(item['link'])

    return urls
def compare_by_title_search_page(self, title: str) -> bool:
    # https://www.fakku.net/search/+title
    full_url = urljoin(constants.main_url, 'search/+') + quote(title)

    logger.info("Querying URL: {}".format(full_url))

    request_dict = construct_request_dict(self.settings, self.own_settings)

    response = request_with_retries(
        full_url,
        request_dict,
        post=False,
        retries=3
    )

    if not response:
        logger.info("Got no response from server")
        return False

    response.encoding = 'utf-8'

    soup_1 = BeautifulSoup(response.text, 'html.parser')

    matches_links = set()

    for link_container in soup_1.find_all("div", class_=re.compile("content-meta")):
        title_container = link_container.find("a", class_="content-title")
        # Skip result rows without a title link to avoid calling .get on None.
        if title_container:
            matches_links.add(urljoin(constants.main_url, title_container.get('href')))

    self.gallery_links = list(matches_links)

    if len(self.gallery_links) > 0:
        self.found_by = self.name
        return True
    else:
        return False
def compare_by_title(self, title: str) -> bool:
    request_dict = construct_request_dict(self.settings, self.own_settings)
    request_dict['params'] = {'q': title}

    response = request_with_retries(
        constants.main_page,
        request_dict,
        post=False,
    )

    if not response:
        return False

    response.encoding = 'utf-8'

    m = re.finditer(r'a href="/view/(\d+)/*"', response.text)

    matches_links = set()

    if m:
        for match in m:
            matches_links.add("{}{}".format(constants.gallery_container_url, match.group(1)))

    self.gallery_links = list(matches_links)

    if len(self.gallery_links) > 0:
        self.found_by = self.name
        return True
    else:
        return False
def get_galleries_from_page_links(self, page_links: Iterable[str], page_links_results: List[DataDict]) -> None:
    api_page_links = []

    for page_link in page_links:
        m = re.search(r'(.+)/s/(\w+)/(\d+)-(\d+)', page_link)
        if not m:
            continue
        api_page_links.append({'data': [m.group(3), m.group(2), m.group(4)]})

    api_page_links_chunks = list(chunks(api_page_links, 25))

    for i, group in enumerate(api_page_links_chunks):

        if i % 3 == 2:
            time.sleep(self.settings.wait_timer)

        data = {'method': 'gtoken', 'pagelist': [x['data'] for x in group]}

        headers = {'Content-Type': 'application/json'}

        response = request_with_retries(
            constants.ge_api_url,
            {
                'data': json.dumps(data),
                'headers': {**headers, **self.settings.requests_headers},
                'timeout': self.settings.timeout_timer
            },
            post=True,
            logger=self.logger
        )

        if not response:
            continue

        try:
            response_data = response.json()
        except (ValueError, KeyError):
            self.logger.error("Error parsing response to JSON: {}".format(response.text))
            continue

        for gid_token_pair in response_data['tokenlist']:

            discard_approved, discard_message = self.discard_gallery_by_internal_checks(
                gallery_id=gid_token_pair['gid'],
                link=link_from_gid_token_fjord(gid_token_pair['gid'], gid_token_pair['token'], False)
            )

            if discard_approved:
                if not self.settings.silent_processing:
                    self.logger.info(discard_message)
                continue

            page_links_results.append(
                {'data': (gid_token_pair['gid'], gid_token_pair['token']),
                 'link': link_from_gid_token_fjord(gid_token_pair['gid'], gid_token_pair['token'], False)}
            )
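# For reference (hypothetical values): a page link such as
#   https://e-hentai.org/s/abcdef1234/123456-7
# is captured above as gallery id '123456', page token 'abcdef1234' and page number '7',
# so each API call posts a body shaped roughly like
#   {'method': 'gtoken', 'pagelist': [['123456', 'abcdef1234', '7']]}
# and the response's 'tokenlist' holds {'gid': ..., 'token': ...} pairs for each page.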
def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:
    request_dict = construct_request_dict(self.settings, self.own_settings)

    response = request_with_retries(
        link,
        request_dict,
        post=False,
    )

    if not response:
        return None

    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    if soup:
        title_jpn_match = soup.find("div", id=re.compile("info")).h2

        gallery_id_match = re.search(r'{}(\d+)'.format(constants.gallery_container_url), link)

        if not gallery_id_match:
            return None

        gallery_id = 'nh-' + gallery_id_match.group(1)

        gallery = GalleryData(gallery_id, self.name)
        gallery.title = soup.h1.get_text()
        gallery.title_jpn = title_jpn_match.get_text() if title_jpn_match else ''

        gallery_filecount_match = re.search(r'<div>(\d+) page(s*)</div>', response.text)
        if gallery_filecount_match:
            gallery.filecount = int(gallery_filecount_match.group(1))
        else:
            gallery.filecount = 0

        gallery.tags = []
        gallery.link = link
        gallery.posted = dateutil.parser.parse(soup.find("time")['datetime'])

        for tag_container in soup.find_all("a", {"class": "tag"}):
            tag_name = [text for text in tag_container.stripped_strings][0]
            tag_name = tag_name.split(" | ")[0]
            tag_scope = tag_container.parent.parent.get_text()
            tag_ext = tag_container.parent.get_text()
            tag_scope = tag_scope.replace(tag_ext, "").replace("\t", "").replace("\n", "").replace(":", "").lower()

            if tag_scope == 'tags':
                gallery.tags.append(translate_tag(tag_name))
            elif tag_scope == 'categories':
                gallery.category = tag_name.capitalize()
            else:
                gallery.tags.append(translate_tag(tag_scope + ":" + tag_name))
    else:
        return None

    return gallery
def compare_by_title(self, gallery_title: str) -> bool:
    api_url = urljoin(self.own_settings.metadata_url, constants.api_path)
    logger.info("Querying URL: {}".format(api_url))

    request_dict = construct_request_dict(self.settings, self.own_settings)
    request_dict['params'] = {'match': True, 'title': gallery_title}

    response = request_with_retries(
        api_url,
        request_dict,
        post=False,
        retries=3
    )

    if not response:
        logger.info("Got no response from server")
        return False

    response_data = response.json()

    matches_links = set()

    if 'error' in response_data:
        logger.info("Got error from server: {}".format(response_data['error']))
        return False

    for gallery in response_data:
        if 'link' in gallery:
            matches_links.add(gallery['link'])
        if 'gallery_container' in gallery and gallery['gallery_container']:
            if self.settings.gallery_model:
                gallery_container = self.settings.gallery_model.objects.filter(
                    gid=gallery['gallery_container'], provider=gallery['provider']
                )
                first_gallery_container = gallery_container.first()
                if first_gallery_container:
                    gallery['gallery_container_gid'] = first_gallery_container.gid
        if 'magazine' in gallery and gallery['magazine']:
            if self.settings.gallery_model:
                magazine = self.settings.gallery_model.objects.filter(
                    gid=gallery['magazine'], provider=gallery['provider']
                )
                first_magazine = magazine.first()
                if first_magazine:
                    gallery['magazine_gid'] = first_magazine.gid
        if 'posted' in gallery:
            if gallery['posted'] != 0:
                gallery['posted'] = datetime.fromtimestamp(int(gallery['posted']), timezone.utc)
            else:
                gallery['posted'] = None
        self.values_array.append(GalleryData(**gallery))

    self.gallery_links = list(matches_links)

    if len(self.gallery_links) > 0:
        self.found_by = self.name
        return True
    else:
        return False
def search_using_xml_api(self, title: str) -> bool:
    if not self.own_settings.api_key:
        logger.error("Can't use {} API without an api key. Check {}/API_MANUAL.txt".format(
            self.name, constants.main_page
        ))
        return False

    page = 1
    galleries = []

    while True:
        link = '{}/api/{}/?S=objectSearch&sn={}&page={}'.format(
            constants.main_page, self.own_settings.api_key, title, page
        )

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            link,
            request_dict,
            post=False,
        )

        if not response:
            break

        response.encoding = 'utf-8'
        # Based on: https://www.doujinshi.org/API_MANUAL.txt
        api_galleries = convert_api_response_text_to_gallery_dicts(response.text)

        if not api_galleries:
            break

        galleries.extend(api_galleries)

        # The API Manual says 25 results max per query, but in practice we get 50,
        # so fewer than 50 results means there are no more pages.
        if len(api_galleries) < 50:
            break

        page += 1

    self.values_array = galleries

    self.gallery_links = [x.link for x in galleries if x.link]

    if len(self.gallery_links) > 0:
        self.found_by = self.name
        return True
    else:
        return False
def get_image_link_from_tweet_text(tweet_text: str, settings: 'Settings') -> typing.Optional[str]:
    tweet_links = re.findall(r"https://t.co/\w+", tweet_text)

    for tweet_link in tweet_links:

        request_dict = {
            'timeout': settings.timeout_timer,
            'allow_redirects': False
        }

        r = request_with_retries(
            tweet_link,
            request_dict,
            post=False,
        )

        if not r:
            return None

        if 'Location' in r.headers:
            if r.headers['Location'].startswith('https://www.wani.com/product/'):

                request_dict_image = {
                    'headers': settings.requests_headers,
                    'timeout': settings.timeout_timer,
                }

                product_page = request_with_retries(
                    r.headers['Location'],
                    request_dict_image,
                    post=False,
                )

                if not product_page:
                    return None

                product_page.encoding = 'utf-8'

                soup = BeautifulSoup(product_page.text, 'html.parser')

                product_head = soup.find("head")

                if product_head:
                    img_container = product_head.find("meta", property="og:image")
                    if img_container:
                        return img_container['content']

    return None
def get_galleries_from_xml(self, url_group: Iterable[str]) -> list[GalleryData]:
    possible_gallery_ids = [
        self.id_from_url(gallery_url) for gallery_url in url_group
    ]

    galleries_ids = [
        gallery_id.replace('mugi-B', 'B') for gallery_id in possible_gallery_ids if gallery_id
    ]

    galleries = list()

    gallery_chunks = list(chunks(galleries_ids, 25))

    for i, group in enumerate(gallery_chunks):
        logger.info(
            "Calling API ({}). Gallery group: {}, galleries in group: {}, total groups: {}".format(
                self.name, i + 1, len(group), len(gallery_chunks)
            )
        )

        # API doesn't say anything about needing to wait between requests, but we wait just in case.
        if i > 0:
            time.sleep(self.own_settings.wait_timer)

        # Only query the IDs belonging to the current chunk.
        link = constants.main_page + '/api/' + self.own_settings.api_key + '/?S=getID&ID=' + ",".join(group)

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            link,
            request_dict,
            post=False,
        )

        if not response:
            continue

        response.encoding = 'utf-8'
        api_galleries = convert_api_response_text_to_gallery_dicts(response.text)

        if not api_galleries:
            continue

        galleries.extend(api_galleries)

    return galleries
def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:
    request_dict = construct_request_dict(self.settings, self.own_settings)

    response = request_with_retries(
        link,
        request_dict,
        post=False,
    )

    if not response:
        return None

    response.encoding = 'utf-8'

    return self.process_regular_gallery_page(link, response.text)
def compare_by_title_json(self, title: str) -> bool:
    # https://www.fakku.net/suggest/return%20of%20me
    headers = {
        'Content-Type': 'application/json',
        'Referer': constants.main_url + '/',
        'X-Requested-With': 'XMLHttpRequest',
    }

    self.logger.info("Querying URL: {}".format(
        urljoin(constants.main_url, 'suggest/') + quote(title.lower())))

    response = request_with_retries(
        urljoin(constants.main_url, 'suggest/') + quote(title.lower()),
        {
            'headers': {**headers, **self.settings.requests_headers},
            'timeout': self.settings.timeout_timer
        },
        post=False,
        retries=3,
        logger=self.logger
    )

    if not response:
        self.logger.info("Got no response from server")
        return False

    response_data = response.json()

    matches_links = set()

    if 'error' in response_data:
        self.logger.info("Got error from server: {}".format(response_data['error']))
        return False

    for gallery in response_data:
        if gallery['type'] in ('doujinshi', 'manga', 'hentai'):
            matches_links.add(urljoin(constants.main_url, gallery['link']))

    self.gallery_links = list(matches_links)

    if len(self.gallery_links) > 0:
        self.found_by = self.name
        return True
    else:
        return False
def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:
    fjord, gid, token = fjord_gid_token_from_link(link)

    if fjord is None or gid is None or token is None:
        return None

    if fjord:
        api_page = constants.ex_api_url
    else:
        api_page = constants.ge_api_url

    data = utilities.request_data_from_gid_token_iterable([(gid, token)])

    headers = {'Content-Type': 'application/json'}

    response = request_with_retries(
        api_page,
        {
            'data': json.dumps(data),
            'headers': {**headers, **self.settings.requests_headers},
            'cookies': self.own_settings.cookies,
            'timeout': self.settings.timeout_timer
        },
        post=True,
        logger=self.logger
    )

    if not response:
        return None

    try:
        response_data = response.json()
    except (ValueError, KeyError):
        self.logger.error("Error parsing response to JSON: {}".format(response.text))
        return None

    for gallery_data in response_data['gmetadata']:
        if 'error' in gallery_data:
            self.logger.error(
                "Adding gallery {}: failed with error: {}".format(
                    gallery_data['gid'], gallery_data['error']
                )
            )
            return None
        internal_gallery_data = map_external_gallery_data_to_internal(gallery_data)
        return internal_gallery_data

    return None
def start_download(self) -> None:
    if not self.gallery or not self.gallery.temp_archive:
        return

    logger.info(
        "Downloading an archive: {} from a Panda Backup-like source: {}".format(
            self.gallery.title, self.gallery.temp_archive['link']
        )
    )

    to_use_filename = get_base_filename_string_from_gallery_data(self.gallery)

    to_use_filename = replace_illegal_name(to_use_filename)

    self.gallery.filename = available_filename(
        self.settings.MEDIA_ROOT,
        os.path.join(self.own_settings.archive_dl_folder, to_use_filename + '.zip'))  # TODO: File could be cbz.

    request_dict = construct_request_dict(self.settings, self.own_settings)
    request_dict['stream'] = True

    request_file = request_with_retries(
        self.gallery.temp_archive['link'],
        request_dict,
    )

    if not request_file:
        logger.error("Could not download archive")
        self.return_code = 0
        return

    filepath = os.path.join(self.settings.MEDIA_ROOT, self.gallery.filename)

    with open(filepath, 'wb') as fo:
        for chunk in request_file.iter_content(4096):
            fo.write(chunk)

    self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(filepath)

    if self.gallery.filesize > 0:
        self.crc32 = calc_crc32(filepath)
        self.fileDownloaded = 1
        self.return_code = 1
    else:
        logger.error("Could not download archive")
        self.return_code = 0
def request_torrent_download(self, root: str, gid: str, token: str) -> Optional[requests.models.Response]:

    url = root + '/gallerytorrents.php'

    params = {'gid': gid, 't': token}

    request_dict = construct_request_dict(self.settings, self.own_settings)
    request_dict['params'] = params

    response = request_with_retries(
        url,
        request_dict,
        post=True,
    )

    return response
def request_torrent_download(self, root: str, gid: str, token: str) -> Optional[requests.models.Response]:

    url = root + '/gallerytorrents.php'

    params = {'gid': gid, 't': token}

    response = request_with_retries(
        url,
        {
            'params': params,
            'cookies': self.own_settings.cookies,
            'headers': self.settings.requests_headers,
            'timeout': self.settings.timeout_timer
        },
        post=True,
        logger=self.logger
    )

    return response
def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:
    link_root, gid, token = root_gid_token_from_link(link)

    if link_root is None or gid is None or token is None:
        return None

    if self.own_settings.use_ex_for_fjord and self.own_settings.cookies and link_root == constants.ex_api_url:
        api_page = constants.ex_api_url
    else:
        api_page = constants.ge_api_url

    data = utilities.request_data_from_gid_token_iterable([(gid, token)])

    headers = {'Content-Type': 'application/json'}

    request_dict = construct_request_dict(self.settings, self.own_settings)
    request_dict['headers'] = {**headers, **self.settings.requests_headers}
    request_dict['data'] = json.dumps(data)

    response = request_with_retries(
        api_page,
        request_dict,
        post=True,
    )

    if not response:
        return None

    try:
        response_data = response.json()
    except (ValueError, KeyError):
        logger.error("Could not parse response to JSON: {}".format(response.text))
        return None

    for gallery_data in response_data['gmetadata']:
        if 'error' in gallery_data:
            logger.error(
                "Adding gallery {}: failed with error: {}".format(gallery_data['gid'], gallery_data['error'])
            )
            return None
        internal_gallery_data = map_external_gallery_data_to_internal(gallery_data)
        return internal_gallery_data

    return None
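# Note: request_data_from_gid_token_iterable presumably builds the standard gallery
# metadata request body for the API, along the lines of (gid/token are placeholder values):
#   {'method': 'gdata', 'gidlist': [[618395, '0439fa3666']], 'namespace': 1}
# which is why the response above is read back through response_data['gmetadata'].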
def request_archive_download(self, root: str, gid: str, token: str, key: str) -> Optional[requests.models.Response]:

    url = root + '/archiver.php'

    params = {'gid': gid, 'token': token, 'or': key}

    request_dict = construct_request_dict(self.settings, self.own_settings)
    request_dict['params'] = params
    request_dict['data'] = constants.archive_download_data

    response = request_with_retries(
        url,
        request_dict,
        post=True,
    )

    return response
def request_archive_download(self, root: str, gid: str, token: str, key: str) -> Optional[requests.models.Response]:

    url = root + '/archiver.php'

    params = {'gid': gid, 'token': token, 'or': key}

    response = request_with_retries(
        url,
        {
            'params': params,
            'cookies': self.own_settings.cookies,
            'data': constants.archive_download_data,
            'headers': self.settings.requests_headers,
            'timeout': self.settings.timeout_timer
        },
        post=True,
        logger=self.logger
    )

    return response
def crawl_feed(self, feed_url: Optional[str] = None) -> List[str]:

    urls: List[str] = []

    if not feed_url:
        feed_url = constants.rss_url

    response = request_with_retries(
        feed_url,
        {
            'headers': self.settings.requests_headers,
            'timeout': self.settings.timeout_timer,
        },
        post=False,
        logger=self.logger
    )

    if not response:
        self.logger.error("No response from URL: {}, returning".format(feed_url))
        return urls

    response.encoding = 'utf-8'

    match_string = re.compile(r"/view/(\d+)/*$")

    soup = BeautifulSoup(response.text, 'html.parser')

    content_container = soup.find("div", class_="columns")

    if not content_container:
        self.logger.error("Content container not found, returning")
        return urls

    url_containers = content_container.find_all("a", href=match_string)

    for url_container in url_containers:
        url_link = url_container.get('href')
        complete_url = "{}{}".format(constants.main_page, url_link)
        urls.append(complete_url)

    return urls
def compare_by_title(self, title: str) -> bool:
    request_dict = construct_request_dict(self.settings, self.own_settings)
    request_dict['params'] = {'q': title}

    r = request_with_retries(
        urljoin(constants.main_url, 'search/'),
        request_dict,
    )

    if not r:
        logger.info("Got no response from server")
        return False

    r.encoding = 'utf-8'
    soup_1 = BeautifulSoup(r.text, 'html.parser')

    matches_links = set()

    discard_titles = ["(SPANISH)", "(FRENCH)"]

    for link_container in soup_1.find_all("a", class_=re.compile("product-card")):
        # Discard spanish/french releases
        title_container = link_container.find("div", class_="product-card__name")
        if any(x for x in discard_titles if title_container.get_text().startswith(x)):
            continue
        matches_links.add(
            urljoin(
                constants.main_url,
                urljoin(link_container['href'], urlparse(link_container['href']).path)
            )
        )

    self.gallery_links = list(matches_links)

    if len(self.gallery_links) > 0:
        self.found_by = self.name
        return True
    else:
        return False
def crawl_feed(self, feed_url: str = '') -> list[ChaikaGalleryData]:

    request_dict = construct_request_dict(self.settings, self.own_settings)

    response = request_with_retries(
        feed_url,
        request_dict,
        post=False,
    )

    dict_list = []

    if not response:
        return []

    try:
        json_decoded = response.json()
    except (ValueError, KeyError):
        logger.error("Could not parse response to JSON: {}".format(response.text))
        return []

    if type(json_decoded) == dict:
        if 'galleries' in json_decoded:
            dict_list = json_decoded['galleries']
        else:
            dict_list.append(json_decoded)
    elif type(json_decoded) == list:
        dict_list = json_decoded

    total_galleries_filtered: list[ChaikaGalleryData] = []

    for gallery in dict_list:
        if 'result' in gallery:
            continue
        gallery['posted'] = datetime.fromtimestamp(int(gallery['posted']), timezone.utc)
        gallery_data = ChaikaGalleryData(**gallery)
        total_galleries_filtered.append(gallery_data)

    return total_galleries_filtered
def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:
    request_dict = construct_request_dict(self.settings, self.own_settings)

    response = request_with_retries(
        link,
        request_dict,
        post=False,
    )

    if not response:
        return None

    response.encoding = 'utf-8'

    new_text = re.sub(r'(<div class="right">\d+?)</b>', r'\1', response.text)

    if constants.main_url + '/magazines/' in link:
        return self.process_magazine_page(link, new_text)
    else:
        return self.process_regular_gallery_page(link, new_text)
def parse_posted_date_from_feed(self, link: str, gid: str) -> Optional[datetime]:
    request_dict = construct_request_dict(self.settings, self.own_settings)

    response = request_with_retries(
        link,
        request_dict,
        post=False,
    )

    if not response:
        return None

    response.encoding = 'utf-8'

    feed = feedparser.parse(response.text)

    for item in feed['items']:
        if gid in item['id']:
            return date_parser.parse(item['published'], tzinfos=constants.extra_feed_url_timezone)

    return None
def compare_by_title(self, title: str) -> bool:

    headers = {'Content-Type': 'application/json'}

    api_link = constants.posts_api_url

    payload = {'search': title}

    response = request_with_retries(
        api_link,
        {
            'headers': {**headers, **self.settings.requests_headers},
            'timeout': self.settings.timeout_timer,
            'params': payload
        },
        post=False,
        logger=self.logger
    )

    if not response:
        return False

    response.encoding = 'utf-8'

    try:
        response_data = response.json()
    except (ValueError, KeyError):
        return False

    matches_links = set()

    for gallery in response_data:
        matches_links.add(gallery['link'])

    self.gallery_links = list(matches_links)

    if len(self.gallery_links) > 0:
        self.found_by = self.name
        return True
    else:
        return False
def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:
    request_dict = construct_request_dict(self.settings, self.own_settings)

    response = request_with_retries(
        link,
        request_dict,
        post=False,
    )

    if not response:
        return None

    response.encoding = 'utf-8'

    match_string = re.compile(constants.main_page + '/(.+)/$')

    tags = []

    soup = BeautifulSoup(response.text, 'html.parser')

    content_container = soup.find("div", class_="content")

    if not content_container:
        return None

    artists_container = content_container.find_all(
        "a", href=re.compile(constants.main_page + '/artist/.*/$'))

    for artist in artists_container:
        tags.append("artist:{}".format(artist.get_text()))

    tags_container = content_container.find_all(
        "a", href=re.compile(constants.main_page + '/tag/.*/$'))

    for tag in tags_container:
        tags.append(tag.get_text())

    # thumbnail_small_container = content_container.find("img")
    # if thumbnail_small_container:
    #     thumbnail_url = thumbnail_small_container.get('src')

    # Read the og: meta values from their content attributes.
    thumbnail_url = ''
    thumbnail_container = soup.find("meta", property="og:image")
    if thumbnail_container:
        thumbnail_url = thumbnail_container.get('content', '')

    link_container = soup.find("meta", property="og:link")
    if not link_container:
        return None

    match_result = match_string.match(link_container.get('content', ''))
    if not match_result:
        return None

    title_container = soup.find("meta", property="og:title")

    gallery = GalleryData(
        match_result.group(1),
        self.name,
        link=link,
        title=title_container.get('content', '') if title_container else '',
        comment='',
        thumbnail_url=thumbnail_url,
        category='Manga',
        uploader='',
        posted=None,
        filecount=0,
        filesize=0,
        expunged=False,
        rating='',
        tags=translate_tag_list(tags),
        content=content_container.encode_contents(),
    )

    return gallery
def wanted_generator(settings: 'Settings', attrs: 'AttributeManager'):
    own_settings = settings.providers[constants.provider_name]

    if not own_settings.api_key:
        logger.error(
            "Can't use {} API without an api key. Check {}/API_MANUAL.txt".format(
                constants.provider_name, constants.main_page))
        return False

    queries: DataDict = {}
    queries_slist_params: DataDict = {}
    for attr in attrs.filter(name__startswith='wanted_params_'):

        attr_info = attr.name.replace('wanted_params_', '')
        query_name, attr_name = attr_info.split("_", maxsplit=1)

        if query_name not in queries:
            queries[query_name] = {
                'page': 1,
                'S': 'objectSearch',
                'match': 0,
                'order': 'added',
                'flow': 'DESC'
            }

        if attr_name.startswith('slist_'):
            if query_name not in queries_slist_params:
                queries_slist_params[query_name] = []
            queries_slist_params[query_name].append('{}:{}'.format(attr_name.replace('slist_', ''), attr.value))
        else:
            queries[query_name].update({attr_name: attr.value})

    for query_name, slist_params in queries_slist_params.items():
        queries[query_name].update({'slist': '|'.join(slist_params)})

    for query_name, query_values in queries.items():

        while True:
            # Read the values from the newly created Provider Model,
            # that should be created like this (extracted from the form):
            # wanted_params_match: Any, Sounds Like, Start With, End With, Exact -> 0, 4, 1, 2, 3
            # wanted_params_age: 18+ -> blank/Y/N
            # wanted_params_anth: Anthology -> blank/Y/N
            # wanted_params_bcopy: Copybook -> blank/Y/N
            # wanted_params_FREE: Free -> blank/Y/N
            # wanted_params_flist: Type ->
            # blank: Any
            # 19: Bootleg
            # 18: Calendar
            # 12: Commercial Artbook
            # 8: Commercial CG
            # 7: Commercial Magazine
            # 25: Commercial Mook
            # 11: Commercial Novel
            # 10: Commercial other
            # 13: Commercial other book
            # 9: Commercial Soft
            # 2: Doujin CG
            # 24: Doujin Goods
            # 23: Doujin Movie
            # 22: Doujin Music
            # 21: Doujin Novel
            # 4: Doujin Other
            # 3: Doujin Soft
            # 1: Doujinshi
            # 5: Manga
            # 6: Manga (Part)
            # 17: Postcard
            # 16: Poster
            # 15: Shitajiki
            # 14: Telephone Card
            # 20: Unknown
            # wanted_params_date: Release date from -> yyyy-mm-dd
            # wanted_params_date2: Release date to -> yyyy-mm-dd
            # for slist parameters:
            # Here is the list of ALL search terms:
            # C: Circle
            # A: Author
            # P: Parody
            # H: Character
            # N: Convention
            # O: Collections
            # K: Content
            # G: Genre
            # T: Type
            # L: Publisher
            # I: Imprint
            # wanted_params_slist_C: Separated by |
            # wanted_params_slist_A: Separated by |
            # wanted_params_slist_P: Separated by |
            # wanted_params_slist_H: Separated by |
            # wanted_params_slist_K: Separated by |
            # wanted_params_slist_G: Separated by |
            # wanted_params_slist_N: One
            # wanted_params_slist_O: One
            # wanted_params_slist_L: One
            # wanted_params_slist_I: One
            # wanted_params_cont: One
            # wanted_params_sub: One
            # wanted_params_scen: Censored -> blank/Y/N

            new_query = urllib.parse.urlencode(query_values, doseq=True)

            logger.info(
                'Querying {} for auto wanted galleries, page: {}, query name: {}, query: {}'.format(
                    constants.provider_name, query_values['page'], query_name, new_query))

            link = '{}/api/{}/?{}'.format(constants.main_page, own_settings.api_key, new_query)

            provider, provider_created = Provider.objects.get_or_create(
                slug=constants.provider_name, defaults={'name': constants.provider_name})

            remaining_queries, int_created = attrs.get_or_create(
                provider=provider,
                name='remaining_queries',
                data_type='int',
                defaults={
                    'value_int': constants.daily_requests,
                })

            last_query_date, date_created = attrs.get_or_create(
                provider=provider,
                name='last_query_date',
                data_type='date',
                defaults={
                    'value_date': django_tz.now(),
                })

            if not date_created:
                limit_time = datetime.time(
                    tzinfo=datetime.timezone(datetime.timedelta(hours=1)))  # GMT+1 is when server resets
                if last_query_date.value.timetz() < limit_time < django_tz.now().timetz():
                    remaining_queries.value = constants.daily_requests
                    remaining_queries.save()

            if remaining_queries.value <= 0:
                logger.warning(
                    "Daily queries quota {} reached for {}. It resets at 00:00 GMT+1".format(
                        constants.daily_requests, constants.provider_name))
                return

            request_dict = construct_request_dict(settings, own_settings)

            response = request_with_retries(
                link,
                request_dict,
                post=False,
            )

            remaining_queries.value -= 1
            remaining_queries.save()
            last_query_date.value = django_tz.now()
            last_query_date.save()

            if not response:
                logger.error(
                    'For provider {}: Got to page {}, but did not get a response, stopping'.format(
                        constants.provider_name, query_values['page']))
                break

            response.encoding = 'utf-8'
            # Based on: https://www.doujinshi.org/API_MANUAL.txt
            api_galleries = convert_api_response_text_to_gallery_dicts(response.text)

            if not api_galleries:
                logger.error('Server response: {}'.format(response.text))
                logger.error(
                    'For provider {}: Got to page {}, but could not parse the response into galleries, stopping'.format(
                        constants.provider_name, query_values['page']))
                break

            # Listen to what the server says
            remaining_queries.value = api_galleries[0].queries
            remaining_queries.save()

            used = Gallery.objects.filter(
                gid__in=[x.gid for x in api_galleries],
                provider=constants.provider_name)

            # If the amount of galleries present in database is equal to what we get from the page,
            # we assume we already processed everything. You can force to process everything by using:
            force_process, force_created = attrs.get_or_create(
                provider=provider,
                name='force_process',
                data_type='bool',
                defaults={
                    'value_bool': False,
                })

            logger.info(
                'For provider {}: Page has {} galleries, from which {} are already present in the database.'.format(
                    constants.provider_name, len(api_galleries), used.count()))

            if not force_process.value and used.count() == len(api_galleries):
                logger.info(
                    'For provider {}: Got to page {}, it has already been processed entirely, stopping'.format(
                        constants.provider_name, query_values['page']))
                break

            used_gids = used.values_list('gid', flat=True)

            for gallery_data in api_galleries:
                if gallery_data.gid not in used_gids:
                    if not gallery_data.dl_type:
                        gallery_data.dl_type = 'auto_wanted'
                    wanted_reason = attrs.fetch_value('wanted_reason_{}'.format(query_name))
                    if isinstance(wanted_reason, str):
                        gallery_data.reason = wanted_reason or 'backup'
                    gallery = Gallery.objects.add_from_values(gallery_data)
                    # We match anyways in case there's a previous WantedGallery.
                    # Actually, we don't match since we only get metadata here, so it should not count as found.
                    publisher_name = ''
                    publisher = gallery.tags.filter(scope='publisher').first()
                    if publisher:
                        publisher_name = publisher.name

                    if not gallery.title_jpn:
                        continue

                    search_title = format_title_to_wanted_search(gallery.title_jpn)

                    wanted_galleries: typing.Iterable[WantedGallery] = WantedGallery.objects.filter(
                        title_jpn=gallery.title_jpn, search_title=search_title)

                    if not wanted_galleries:
                        wanted_gallery = WantedGallery.objects.create(
                            title=gallery.title or gallery.title_jpn,
                            title_jpn=gallery.title_jpn,
                            search_title=search_title,
                            book_type=gallery.category,
                            page_count=gallery.filecount,
                            publisher=publisher_name,
                            add_as_hidden=True,
                            reason=attrs.fetch_value('wanted_reason_{}'.format(query_name)) or '',
                            public=attrs.fetch_value('wanted_public_{}'.format(query_name)) or False,
                            should_search=attrs.fetch_value('wanted_should_search_{}'.format(query_name)) or True,
                            keep_searching=attrs.fetch_value('wanted_keep_searching_{}'.format(query_name)) or True,
                            category='Manga',
                            unwanted_title=own_settings.unwanted_title or settings.auto_wanted.unwanted_title)

                        wanted_provider_string = attrs.fetch_value('wanted_provider_{}'.format(query_name))
                        if wanted_provider_string and isinstance(wanted_provider_string, str):
                            wanted_provider_instance = Provider.objects.filter(slug=wanted_provider_string).first()
                            if wanted_provider_instance:
                                wanted_gallery.wanted_providers.add(wanted_provider_instance)

                        wanted_providers_string = attrs.fetch_value('wanted_providers_{}'.format(query_name))
                        if wanted_providers_string and isinstance(wanted_providers_string, str):
                            for wanted_provider in wanted_providers_string.split():
                                wanted_provider = wanted_provider.strip()
                                wanted_provider_instance = Provider.objects.filter(slug=wanted_provider).first()
                                if wanted_provider_instance:
                                    wanted_gallery.wanted_providers.add(wanted_provider_instance)

                        for artist in gallery.tags.filter(scope='artist'):
                            artist_obj = Artist.objects.filter(name_jpn=artist.name).first()
                            if not artist_obj:
                                artist_obj = Artist.objects.create(name=artist.name, name_jpn=artist.name)
                            wanted_gallery.artists.add(artist_obj)

                        logger.info(
                            "Created wanted gallery ({}): {}, search title: {}".format(
                                wanted_gallery.book_type,
                                wanted_gallery.get_absolute_url(),
                                gallery.title_jpn))

                        wanted_galleries = [wanted_gallery]

                    for wanted_gallery in wanted_galleries:

                        mention, mention_created = wanted_gallery.mentions.get_or_create(
                            mention_date=gallery.create_date,
                            release_date=gallery.posted,
                            type='release_date',
                            source=constants.provider_name,
                        )
                        if mention_created and gallery.thumbnail:
                            mention.copy_img(gallery.thumbnail.path)
                        wanted_gallery.calculate_nearest_release_date()

            # galleries.extend(api_galleries)

            # The API Manual says 25 results max per query, but in practice we get 50,
            # so fewer than 50 results means there are no more pages.
            if len(api_galleries) < 50:
                logger.info(
                    'Got to page {}, and we got less than 50 galleries, '
                    'meaning there is no more pages, stopping'.format(query_values['page']))
                break

            query_values['page'] += 1

    logger.info("{} Auto wanted ended.".format(constants.provider_name))
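# Illustration only (not part of the provider code): how the wanted_params_* attributes
# documented above are folded into the API query. Assuming a query named 'query1' with
# these hypothetical attribute values:
#   wanted_params_query1_flist = '1'
#   wanted_params_query1_slist_C = 'SomeCircle'
#   wanted_params_query1_slist_A = 'SomeAuthor'
# the loops build query_values roughly like:
#   {'page': 1, 'S': 'objectSearch', 'match': 0, 'order': 'added', 'flow': 'DESC',
#    'flist': '1', 'slist': 'C:SomeCircle|A:SomeAuthor'}
# and urllib.parse.urlencode(query_values, doseq=True) turns that into something like:
#   page=1&S=objectSearch&match=0&order=added&flow=DESC&flist=1&slist=C%3ASomeCircle%7CA%3ASomeAuthor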
def crawl_feed(self, feed_url: str = '') -> list[GalleryData]:

    if not feed_url:
        feed_url = constants.rss_url

    request_dict = construct_request_dict(self.settings, self.own_settings)

    response = request_with_retries(
        feed_url,
        request_dict,
        post=False,
    )

    if not response:
        logger.error("Got no response from feed URL: {}".format(feed_url))
        return []

    response.encoding = 'utf-8'

    feed = feedparser.parse(response.text)

    galleries = []

    match_string = re.compile(constants.main_page + '/(.+)/$')

    skip_tags = ['Uncategorized']

    logger.info(
        "Provided RSS URL for provider ({}), adding {} found links".format(
            self.name, len(feed['items'])))

    for item in feed['items']:

        tags = [x.term for x in item['tags'] if x.term not in skip_tags]

        thumbnail_url = ''

        for content in item['content']:
            soup = BeautifulSoup(content.value, 'html.parser')
            artists_container = soup.find_all(
                "a", href=re.compile(constants.main_page + '/artist/.*/$'))
            for artist in artists_container:
                tags.append("artist:{}".format(artist.get_text()))
            thumbnail_small_container = soup.find("img")
            if thumbnail_small_container:
                thumbnail_url = thumbnail_small_container.get('src')

        match_result = match_string.match(item['link'])
        if not match_result:
            continue

        gallery = GalleryData(
            match_result.group(1),
            self.name,
            title=item['title'],
            comment=item['description'],
            thumbnail_url=thumbnail_url,
            category='Manga',
            uploader=item['author'],
            posted=datetime.strptime(item['published'], "%a, %d %b %Y %H:%M:%S %z"),
            filecount=0,
            filesize=0,
            expunged=False,
            rating='',
            tags=translate_tag_list(tags),
            content=item['content'][0].value,
            link=item['link']
        )

        # Must check here since this method is called after the main check in crawl_urls
        if self.general_utils.discard_by_tag_list(gallery.tags):
            continue

        if not gallery.link:
            continue

        discard_approved, discard_message = self.discard_gallery_by_internal_checks(
            gallery.gid, link=gallery.link)

        if discard_approved:
            if not self.settings.silent_processing:
                logger.info(discard_message)
            continue

        galleries.append(gallery)

    return galleries
def get_values_from_gallery_link_json(self, link: str) -> Optional[GalleryData]:

    match_string = re.compile(constants.main_page + '/(.+)/$')

    m = match_string.match(link)

    if m:
        gallery_slug = m.group(1)
    else:
        return None

    api_link = constants.posts_api_url

    request_dict = construct_request_dict(self.settings, self.own_settings)
    request_dict['params'] = {'slug': gallery_slug}

    response = request_with_retries(
        api_link,
        request_dict,
        post=False,
    )

    if not response:
        return None

    response.encoding = 'utf-8'

    try:
        response_data = response.json()
    except (ValueError, KeyError):
        logger.error("Could not parse response to JSON: {}".format(api_link))
        return None

    tags = []
    thumbnail_url = ''

    if len(response_data) < 1:
        return None

    api_gallery = response_data[0]

    soup = BeautifulSoup(api_gallery['content']['rendered'], 'html.parser')

    artists_container = soup.find_all("a", href=re.compile(constants.main_page + '/artist/.*/$'))

    for artist in artists_container:
        tags.append("artist:{}".format(artist.get_text()))

    tags_container = soup.find_all("a", href=re.compile(constants.main_page + '/tag/.*/$'))

    for tag in tags_container:
        tags.append(tag.get_text())

    thumbnail_small_container = soup.find("img")
    if thumbnail_small_container:
        thumbnail_url = thumbnail_small_container.get('src')

    gallery = GalleryData(
        gallery_slug,
        self.name,
        link=link,
        title=unescape(api_gallery['title']['rendered']),
        comment='',
        thumbnail_url=thumbnail_url,
        category='Manga',
        uploader='',
        posted=datetime.strptime(api_gallery['date_gmt'] + '+0000', "%Y-%m-%dT%H:%M:%S%z"),
        filecount=0,
        filesize=0,
        expunged=False,
        rating='',
        tags=translate_tag_list(tags),
        content=api_gallery['content']['rendered'],
    )

    return gallery