def get_galleries_from_page_links(self, page_links: Iterable[str],
                                  page_links_results: List[DataDict]) -> None:

    api_page_links = []

    for page_link in page_links:
        m = re.search(r'(.+)/s/(\w+)/(\d+)-(\d+)', page_link)
        if not m:
            continue
        api_page_links.append(
            {'data': [m.group(3), m.group(2), m.group(4)]})

    api_page_links_chunks = list(chunks(api_page_links, 25))

    for i, group in enumerate(api_page_links_chunks):

        if i % 3 == 2:
            time.sleep(self.settings.wait_timer)

        data = {
            'method': 'gtoken',
            'pagelist': [x['data'] for x in group]}

        headers = {'Content-Type': 'application/json'}

        response = request_with_retries(
            constants.ge_api_url,
            {
                'data': json.dumps(data),
                'headers': {**headers, **self.settings.requests_headers},
                'timeout': self.settings.timeout_timer
            },
            post=True,
            logger=self.logger
        )

        if not response:
            continue
        try:
            response_data = response.json()
        except (ValueError, KeyError):
            self.logger.error("Error parsing response to JSON: {}".format(response.text))
            continue

        for gid_token_pair in response_data['tokenlist']:

            discard_approved, discard_message = self.discard_gallery_by_internal_checks(
                gallery_id=gid_token_pair['gid'],
                link=link_from_gid_token_fjord(gid_token_pair['gid'], gid_token_pair['token'], False)
            )

            if discard_approved:
                if not self.settings.silent_processing:
                    self.logger.info(discard_message)
                continue

            page_links_results.append(
                {'data': (gid_token_pair['gid'], gid_token_pair['token']),
                 'link': link_from_gid_token_fjord(gid_token_pair['gid'], gid_token_pair['token'], False)})
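# All of the request batching in this section relies on a `chunks` helper that
# is imported from elsewhere and not shown here. A minimal sketch of the
# assumed behavior (split a sequence into ordered groups of at most n items),
# for reference only:
def chunks(seq, n):
    # Yield successive n-sized slices from seq; the last slice may be shorter.
    for i in range(0, len(seq), n):
        yield seq[i:i + n]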
def get_galleries_from_xml(self, url_group: Iterable[str]) -> list[GalleryData]:

    possible_gallery_ids = [
        self.id_from_url(gallery_url) for gallery_url in url_group
    ]

    galleries_ids = [
        gallery_id.replace('mugi-B', 'B') for gallery_id in possible_gallery_ids if gallery_id
    ]

    galleries = list()

    gallery_chunks = list(chunks(galleries_ids, 25))

    for i, group in enumerate(gallery_chunks):
        logger.info(
            "Calling API ({}). Gallery group: {}, galleries in group: {}, total groups: {}".format(
                self.name, i + 1, len(group), len(gallery_chunks)))

        # API doesn't say anything about needing to wait between requests, but we wait just in case.
        if i > 0:
            time.sleep(self.own_settings.wait_timer)

        # Request only the current chunk; joining the full galleries_ids here
        # would defeat the chunking.
        link = constants.main_page + '/api/' + self.own_settings.api_key + '/?S=getID&ID=' + ",".join(group)

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            link,
            request_dict,
            post=False,
        )

        if not response:
            continue

        response.encoding = 'utf-8'
        api_galleries = convert_api_response_text_to_gallery_dicts(response.text)

        if not api_galleries:
            continue

        galleries.extend(api_galleries)

    return galleries
def crawl_json(self, json_string: str, wanted_filters: QuerySet = None, wanted_only: bool = False) -> None:

    if not self.settings.gallery_model:
        return

    dict_list = []
    json_decoded = json.loads(json_string)

    if isinstance(json_decoded, dict):
        dict_list.append(json_decoded)
    elif isinstance(json_decoded, list):
        dict_list = json_decoded

    galleries_gids = []
    found_galleries = set()
    total_galleries_filtered: List[GalleryData] = []
    gallery_wanted_lists: Dict[str, List['WantedGallery']] = defaultdict(list)

    for gallery in dict_list:
        galleries_gids.append(gallery['gid'])
        gallery['posted'] = datetime.fromtimestamp(int(gallery['posted']), timezone.utc)
        gallery_data = GalleryData(**gallery)
        total_galleries_filtered.append(gallery_data)

    for galleries_gid_group in list(chunks(galleries_gids, 900)):
        for found_gallery in self.settings.gallery_model.objects.filter(gid__in=galleries_gid_group):
            discard_approved, discard_message = self.discard_gallery_by_internal_checks(
                gallery=found_gallery,
                link=found_gallery.get_link())

            if discard_approved:
                self.logger.info(discard_message)
                found_galleries.add(found_gallery.gid)

    # Start the count at 1 so the "Gallery {} of {}" messages are not off by one.
    for count, gallery in enumerate(total_galleries_filtered, start=1):

        if gallery.gid in found_galleries:
            continue

        if self.general_utils.discard_by_tag_list(gallery.tags):
            self.logger.info(
                "Gallery {} of {}: Skipping gallery {}, because it's tagged with global discarded tags".format(
                    count, len(total_galleries_filtered), gallery.title))
            continue

        if wanted_filters:
            self.compare_gallery_with_wanted_filters(
                gallery, gallery.link, wanted_filters, gallery_wanted_lists)
            if wanted_only and not gallery_wanted_lists[gallery.gid]:
                continue

        self.logger.info(
            "Gallery {} of {}: Gallery {} will be processed.".format(
                count, len(total_galleries_filtered), gallery.title))

        if gallery.thumbnail:
            original_thumbnail_url = gallery.thumbnail_url
            gallery.thumbnail_url = gallery.thumbnail
            gallery_instance = self.settings.gallery_model.objects.update_or_create_from_values(gallery)
            gallery_instance.thumbnail_url = original_thumbnail_url
            gallery_instance.save()
        else:
            self.settings.gallery_model.objects.update_or_create_from_values(gallery)
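# A hedged usage sketch for crawl_json: it expects a JSON string holding either
# a single gallery object or a list of them, each with at least a 'gid' and a
# Unix-timestamp 'posted' field. The instance name `crawler` and the field
# values below are hypothetical:
import json

example_json = json.dumps([
    {'gid': '344', 'token': 'abc123', 'posted': '1577836800',
     'title': 'Example gallery', 'tags': ['language:english']},
])
# crawler.crawl_json(example_json, wanted_filters=None, wanted_only=False)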
def crawl_urls(self, urls: List[str], wanted_filters: QuerySet = None, wanted_only: bool = False) -> None:

    unique_urls = set()
    gallery_data_list = []
    fetch_format_galleries: List[DataDict] = []
    unique_page_urls = set()
    gallery_wanted_lists: Dict[str, List['WantedGallery']] = defaultdict(list)

    if not self.downloaders:
        self.logger.warning('No downloaders enabled, returning.')
        return

    for url in urls:

        if constants.rss_url in url:
            feed_links = self.crawl_feed(url)
            unique_urls.update(feed_links)
            self.logger.info(
                "Provided RSS URL for provider ({}), adding {} found links".format(
                    self.name, len(feed_links)))
            continue

        if constants.ex_page_short not in url and constants.ge_page_short not in url:
            self.logger.warning("Invalid URL, skipping: {}".format(url))
            continue

        if '/g/' in url:
            if not self.settings.silent_processing:
                self.logger.info("Provided URL {} is a gallery link, adding".format(url))
            unique_urls.add(url)
            continue

        if '/s/' in url:
            if not self.settings.silent_processing:
                self.logger.info("Provided URL {} is a page link, adding".format(url))
            unique_page_urls.add(url)
            continue

        # Do not crawl main page links if they were submitted anonymously, to prevent spam.
        if len(self.downloaders) == 1 and self.downloaders[0][0].type == 'submit':
            continue

        # Assuming main page URLs.
        unique_urls.update(self.get_galleries_from_main_page_link(url))

    gallery_ids = []
    found_galleries = set()
    total_galleries_filtered = []

    for gallery_url in unique_urls:
        m = re.search(r'(.+)/g/(\d+)/(\w+)', gallery_url)
        if m:
            gallery_ids.append(m.group(2))
            total_galleries_filtered.append(
                (gallery_url, m.group(1), m.group(2), m.group(3)))

    for galleries_gid_group in list(chunks(gallery_ids, 900)):
        for found_gallery in Gallery.objects.filter(gid__in=galleries_gid_group):
            discard_approved, discard_message = self.discard_gallery_by_internal_checks(
                gallery=found_gallery,
                link=found_gallery.get_link())

            if discard_approved:
                if not self.settings.silent_processing:
                    self.logger.info(discard_message)
                found_galleries.add(found_gallery.gid)

    for gallery_tuple in total_galleries_filtered:
        if gallery_tuple[2] not in found_galleries:
            fetch_format_galleries.append({
                'data': (gallery_tuple[2], gallery_tuple[3]),
                'root': gallery_tuple[1],
                'link': gallery_tuple[0]
            })
            if not self.settings.silent_processing:
                self.logger.info(
                    "Gallery {} will be processed. Total galleries: {}".format(
                        gallery_tuple[0], len(fetch_format_galleries)))

    if len(unique_page_urls) > 0:
        self.logger.info("Getting gallery links from page links...")
        page_links_results: List[DataDict] = []
        self.get_galleries_from_page_links(unique_page_urls, page_links_results)
        fetch_format_galleries += page_links_results

    if len(fetch_format_galleries) == 0:
        self.logger.info("No galleries need downloading, returning.")
        return

    fetch_format_galleries_chunks = list(chunks(fetch_format_galleries, 25))
    fjord_galleries = []

    for i, group in enumerate(fetch_format_galleries_chunks):

        # Set based on recommendation in official documentation.
        if i % 3 == 2:
            time.sleep(self.settings.wait_timer)

        if not self.settings.silent_processing:
            self.logger.info(
                "Calling non-fjord API ({}). "
                "Gallery group: {}, galleries in group: {}, total groups: {}".format(
                    self.name, i + 1, len(group), len(fetch_format_galleries_chunks)))

        data = utilities.request_data_from_gid_token_iterable(
            [x['data'] for x in group])

        headers = {'Content-Type': 'application/json'}

        response = request_with_retries(
            constants.ge_api_url,
            {
                'data': json.dumps(data),
                'headers': {**headers, **self.settings.requests_headers},
                'timeout': self.settings.timeout_timer
            },
            post=True,
            logger=self.logger)

        if not response:
            continue
        try:
            response_data = response.json()
        except (ValueError, KeyError):
            self.logger.error("Error parsing response to JSON: {}".format(response.text))
            continue

        for gallery_data in response_data['gmetadata']:

            if 'error' in gallery_data:
                self.logger.error(
                    "Adding gallery {}: failed with error: {}".format(
                        gallery_data['gid'], gallery_data['error']))
                continue

            internal_gallery_data = map_external_gallery_data_to_internal(gallery_data)
            link = link_from_gid_token_fjord(gallery_data['gid'], gallery_data['token'], False)

            if self.general_utils.discard_by_tag_list(internal_gallery_data.tags):
                if not self.settings.silent_processing:
                    self.logger.info(
                        "Skipping gallery {}, because it's tagged with global discarded tags".format(link))
                continue

            if wanted_filters:
                self.compare_gallery_with_wanted_filters(
                    internal_gallery_data, link, wanted_filters, gallery_wanted_lists)
                if wanted_only and not gallery_wanted_lists[internal_gallery_data.gid]:
                    continue

            m = re.search(constants.default_fjord_tags, ",".join(internal_gallery_data.tags))

            if m and self.own_settings.cookies:
                fjord_galleries.append(
                    link_from_gid_token_fjord(gallery_data['gid'], gallery_data['token'], True))
            else:
                gallery_data_list.append(internal_gallery_data)

    fjord_galleries_data = self.fetch_multiple_gallery_data(fjord_galleries)

    if fjord_galleries_data:
        gallery_data_list.extend(fjord_galleries_data)

    self.pass_gallery_data_to_downloaders(gallery_data_list, gallery_wanted_lists)
def get_values_from_gallery_link_list(self, url_list: Iterable[str]) -> List[GalleryData]:

    gid_token_chunks = list(
        chunks([get_gid_token_from_link(link) for link in url_list], 25))

    galleries_data = []

    for i, group in enumerate(gid_token_chunks):

        if i % 3 == 2:
            time.sleep(self.settings.wait_timer)

        if not self.settings.silent_processing:
            self.logger.info(
                "Calling fjord API ({}). "
                "Gallery group: {}, galleries in group: {}, total groups: {}".format(
                    self.name, i + 1, len(group), len(gid_token_chunks)))

        data = utilities.request_data_from_gid_token_iterable(group)

        headers = {'Content-Type': 'application/json'}

        response = request_with_retries(
            constants.ex_api_url,
            {
                'data': json.dumps(data),
                'headers': {**headers, **self.settings.requests_headers},
                'cookies': self.own_settings.cookies,
                'timeout': self.settings.timeout_timer
            },
            post=True,
            logger=self.logger)

        if not response:
            continue
        try:
            response_data = response.json()
        except (ValueError, KeyError):
            self.logger.error("Error parsing response to JSON: {}".format(response.text))
            continue

        for gallery_data in response_data['gmetadata']:

            if 'error' in gallery_data:
                self.logger.error(
                    "Fetching gallery {}: failed with error: {}".format(
                        gallery_data['gid'], gallery_data['error']))
                continue

            internal_gallery_data = map_external_gallery_data_to_internal(gallery_data)

            # Flag galleries whose tags match the fjord tag list.
            m = re.search(constants.default_fjord_tags, ",".join(internal_gallery_data.tags))
            internal_gallery_data.fjord = bool(m)

            galleries_data.append(internal_gallery_data)

    return galleries_data
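# `utilities.request_data_from_gid_token_iterable` (used above) is assumed to
# build the JSON body for the metadata API out of (gid, token) pairs, in the
# same method/list style as the 'gtoken' payload assembled inline in
# get_galleries_from_page_links. A sketch; the exact field names ('gdata',
# 'gidlist', 'namespace') are assumptions:
def request_data_from_gid_token_iterable(gid_token_iterable):
    return {
        'method': 'gdata',
        'gidlist': [[gid, token] for gid, token in gid_token_iterable],
        'namespace': 1,
    }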
def get_values_from_gallery_link_list(self, url_list: Iterable[str],
                                      use_fjord: bool = False) -> list[GalleryData]:

    gid_token_chunks = list(chunks([get_gid_token_from_link(link) for link in url_list], 25))

    galleries_data = []

    if self.own_settings.cookies and use_fjord:
        api_page = constants.ex_api_url
    else:
        api_page = constants.ge_api_url

    for i, group in enumerate(gid_token_chunks):

        if i % 3 == 2:
            time.sleep(self.own_settings.wait_timer)

        if not self.settings.silent_processing:
            logger.info(
                "Calling API ({}), URL: {}. "
                "Gallery group: {}, galleries in group: {}, total groups: {}".format(
                    self.name, api_page, i + 1, len(group), len(gid_token_chunks)))

        data = utilities.request_data_from_gid_token_iterable(group)

        headers = {'Content-Type': 'application/json'}

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['headers'] = {**headers, **self.settings.requests_headers}
        request_dict['data'] = json.dumps(data)

        response = request_with_retries(
            api_page,
            request_dict,
            post=True,
        )

        if not response:
            continue
        try:
            response_data = response.json()
        except (ValueError, KeyError):
            logger.error("Could not parse response to JSON: {}".format(response.text))
            continue

        for gallery_data in response_data['gmetadata']:

            if 'error' in gallery_data:
                logger.error(
                    "Fetching gallery {}: failed with error: {}".format(
                        gallery_data['gid'], gallery_data['error']))
                continue

            internal_gallery_data = map_external_gallery_data_to_internal(gallery_data)

            if use_fjord and internal_gallery_data.fjord:
                internal_gallery_data.root = constants.ex_page
                internal_gallery_data.link = link_from_gid_token_fjord(
                    gallery_data['gid'], gallery_data['token'], True)
            else:
                internal_gallery_data.root = constants.ge_page
                internal_gallery_data.link = link_from_gid_token_fjord(
                    gallery_data['gid'], gallery_data['token'], False)

            galleries_data.append(internal_gallery_data)

    return galleries_data
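# Both variants above convert between gallery URLs and (gid, token) pairs via
# helpers defined elsewhere. Minimal sketches, assuming gallery links follow
# the {root}/g/{gid}/{token} shape matched by the regexes in crawl_urls, and
# that constants.ex_page / constants.ge_page hold the two site roots (the
# trailing slash is an assumption):
import re

def get_gid_token_from_link(link):
    # Extract (gid, token) from a gallery URL; raise on non-gallery links.
    m = re.search(r'/g/(\d+)/(\w+)', link)
    if not m:
        raise ValueError("Not a gallery link: {}".format(link))
    return m.group(1), m.group(2)

def link_from_gid_token_fjord(gid, token, fjord):
    # Rebuild a gallery URL; `fjord` selects which site root to use.
    root = constants.ex_page if fjord else constants.ge_page
    return '{}/g/{}/{}/'.format(root, gid, token)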
def crawl_urls(self, urls: list[str], wanted_filters=None, wanted_only: bool = False) -> None:

    for url in urls:

        dict_list = []
        request_dict = construct_request_dict(self.settings, self.own_settings)

        if '/archive/' in url:
            match_archive_pk = re.search(r'/archive/(\d+)/', url)
            if match_archive_pk:
                api_url = urljoin(self.own_settings.url, constants.api_path)
                request_dict['params'] = {'archive': match_archive_pk.group(1)}

                archive_response = request_with_retries(
                    api_url,
                    request_dict,
                    post=False,
                )

                if not archive_response:
                    logger.error("Did not get a response from URL: {}".format(api_url))
                    continue
                try:
                    json_decoded = archive_response.json()
                except (ValueError, KeyError):
                    logger.error("Could not parse response to JSON: {}".format(archive_response.text))
                    continue

                if json_decoded['gallery']:
                    request_dict['params'] = {'gd': json_decoded['gallery']}

                    gallery_response = request_with_retries(
                        api_url,
                        request_dict,
                        post=False,
                    )

                    if not gallery_response:
                        logger.error("Did not get a response from URL: {}".format(api_url))
                        continue
                    try:
                        json_decoded = gallery_response.json()
                        dict_list.append(json_decoded)
                    except (ValueError, KeyError):
                        logger.error("Could not parse response to JSON: {}".format(gallery_response.text))
                        continue
                else:
                    logger.error("Archive: {} does not have an associated Gallery".format(url))
                    continue
        elif '/gallery/' in url:
            match_gallery_pk = re.search(r'/gallery/(\d+)/', url)
            if match_gallery_pk:
                api_url = urljoin(self.own_settings.url, constants.api_path)
                request_dict['params'] = {'gd': match_gallery_pk.group(1)}

                gallery_response = request_with_retries(
                    api_url,
                    request_dict,
                    post=False,
                )

                if not gallery_response:
                    logger.error("Did not get a response from URL: {}".format(api_url))
                    continue
                try:
                    json_decoded = gallery_response.json()
                    dict_list.append(json_decoded)
                except (ValueError, KeyError):
                    logger.error("Could not parse response to JSON: {}".format(gallery_response.text))
                    continue
        else:
            response = request_with_retries(
                url,
                request_dict,
                post=False,
            )

            if not response:
                logger.error("Did not get a response from URL: {}".format(url))
                continue
            try:
                json_decoded = response.json()
            except (ValueError, KeyError):
                logger.error("Could not parse response to JSON: {}".format(response.text))
                continue

            if isinstance(json_decoded, dict):
                if 'galleries' in json_decoded:
                    dict_list = json_decoded['galleries']
                else:
                    dict_list.append(json_decoded)
            elif isinstance(json_decoded, list):
                dict_list = json_decoded

        galleries_gids = []
        found_galleries = set()
        total_galleries_filtered: list[ChaikaGalleryData] = []
        gallery_wanted_lists: dict[str, list['WantedGallery']] = defaultdict(list)

        for gallery in dict_list:
            if 'result' in gallery:
                continue
            galleries_gids.append(gallery['gid'])
            gallery['posted'] = datetime.fromtimestamp(int(gallery['posted']), timezone.utc)
            gallery_data = ChaikaGalleryData(**gallery)
            total_galleries_filtered.append(gallery_data)

        for galleries_gid_group in list(chunks(galleries_gids, 900)):
            for found_gallery in Gallery.objects.filter(gid__in=galleries_gid_group):
                discard_approved, discard_message = self.discard_gallery_by_internal_checks(
                    gallery=found_gallery,
                    link=found_gallery.get_link())

                if discard_approved:
                    logger.info("{} Real GID: {}".format(discard_message, found_gallery.gid))
                    found_galleries.add(found_gallery.gid)

        for count, gallery in enumerate(total_galleries_filtered, start=1):

            if gallery.gid in found_galleries:
                continue

            discarded_tags = self.general_utils.discard_by_tag_list(gallery.tags)

            if discarded_tags:
                logger.info(
                    "Skipping gallery link {}, because it's tagged with global discarded tags: {}".format(
                        gallery.title, discarded_tags))
                continue

            if wanted_filters:
                self.compare_gallery_with_wanted_filters(
                    gallery, gallery.link, wanted_filters, gallery_wanted_lists)
                if wanted_only and not gallery_wanted_lists[gallery.gid]:
                    continue

            logger.info(
                "Gallery {} of {}: Gallery {} (Real GID: {}) will be processed.".format(
                    count, len(total_galleries_filtered), gallery.title, gallery.gid))

            if gallery.thumbnail:
                original_thumbnail_url = gallery.thumbnail_url
                gallery.thumbnail_url = gallery.thumbnail
                gallery_obj = Gallery.objects.update_or_create_from_values(gallery)
                gallery_obj.thumbnail_url = original_thumbnail_url
                gallery_obj.save()
            else:
                Gallery.objects.update_or_create_from_values(gallery)

            for archive in gallery.archives:
                gallery.temp_archive = archive
                self.pass_gallery_data_to_downloaders([gallery], gallery_wanted_lists)
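# `construct_request_dict` (used by the newer variants) is assumed to assemble
# the base keyword arguments for request_with_retries from the global and
# provider settings, mirroring the literal dict passed by the older crawl_urls
# variant below. A sketch under that assumption:
def construct_request_dict(settings, own_settings):
    return {
        'headers': settings.requests_headers,
        'timeout': settings.timeout_timer,
        'cookies': own_settings.cookies,
    }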
def crawl_urls(self, urls: List[str], wanted_filters=None, wanted_only: bool = False) -> None:

    for url in urls:

        response = request_with_retries(
            url,
            {
                'headers': self.settings.requests_headers,
                'timeout': self.settings.timeout_timer,
                'cookies': self.own_settings.cookies
            },
            post=False,
            logger=self.logger)

        dict_list = []

        # Guard against a failed request: request_with_retries can return None,
        # which would otherwise raise an AttributeError below that the
        # (ValueError, KeyError) handler does not catch.
        if not response:
            self.logger.error("Did not get a response from URL: {}".format(url))
            continue

        try:
            json_decoded = response.json()
        except (ValueError, KeyError):
            self.logger.error("Error parsing response to JSON: {}".format(response.text))
            continue

        if isinstance(json_decoded, dict):
            if 'galleries' in json_decoded:
                dict_list = json_decoded['galleries']
            else:
                dict_list.append(json_decoded)
        elif isinstance(json_decoded, list):
            dict_list = json_decoded

        galleries_gids = []
        found_galleries = set()
        total_galleries_filtered: List[ChaikaGalleryData] = []
        gallery_wanted_lists: Dict[str, List['WantedGallery']] = defaultdict(list)

        for gallery in dict_list:
            if 'result' in gallery:
                continue
            galleries_gids.append(gallery['gid'])
            gallery['posted'] = datetime.fromtimestamp(int(gallery['posted']), timezone.utc)
            gallery_data = ChaikaGalleryData(**gallery)
            total_galleries_filtered.append(gallery_data)

        for galleries_gid_group in list(chunks(galleries_gids, 900)):
            for found_gallery in self.settings.gallery_model.objects.filter(gid__in=galleries_gid_group):
                discard_approved, discard_message = self.discard_gallery_by_internal_checks(
                    gallery=found_gallery,
                    link=found_gallery.get_link())

                if discard_approved:
                    self.logger.info(discard_message)
                    found_galleries.add(found_gallery.gid)

        for gallery in total_galleries_filtered:

            if gallery.gid in found_galleries:
                continue

            if self.general_utils.discard_by_tag_list(gallery.tags):
                self.logger.info(
                    "Skipping gallery {}, because it's tagged with global discarded tags".format(gallery.title))
                continue

            if wanted_filters:
                self.compare_gallery_with_wanted_filters(
                    gallery, gallery.link, wanted_filters, gallery_wanted_lists)
                if wanted_only and not gallery_wanted_lists[gallery.gid]:
                    continue

            self.logger.info("Gallery {} will be processed.".format(gallery.title))

            if gallery.thumbnail:
                original_thumbnail_url = gallery.thumbnail_url
                gallery.thumbnail_url = gallery.thumbnail
                gallery_obj = self.settings.gallery_model.objects.update_or_create_from_values(gallery)
                gallery_obj.thumbnail_url = original_thumbnail_url
                gallery_obj.save()
            else:
                self.settings.gallery_model.objects.update_or_create_from_values(gallery)

            for archive in gallery.archives:
                gallery.archiver_key = archive
                self.pass_gallery_data_to_downloaders([gallery], gallery_wanted_lists)
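# Every function in this section goes through `request_with_retries` and checks
# its result for None. A minimal sketch of the assumed contract (wrap
# requests.get/requests.post, retry on failure, return the Response on success
# and None when every attempt fails); the retry count and backoff are
# assumptions:
import time
import requests

def request_with_retries(url, request_dict, post=False, retries=3, logger=None):
    method = requests.post if post else requests.get
    for attempt in range(1, retries + 1):
        try:
            response = method(url, **request_dict)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            if logger is not None:
                logger.warning(
                    "Request to {} failed (attempt {} of {}): {}".format(url, attempt, retries, e))
            time.sleep(attempt)  # simple linear backoff (assumption)
    return None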