Example 1
    def crawl_feed(self, feed_url: str = '') -> list[str]:

        urls: list[str] = []

        if not feed_url:
            feed_url = constants.rss_url

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            feed_url,
            request_dict,
            post=False,
        )

        if not response:
            logger.error("Got no response from feed URL: {}".format(feed_url))
            return urls

        response.encoding = 'utf-8'

        feed = feedparser.parse(
            response.text
        )

        for item in feed['items']:
            if any(item['title'].startswith(category) for category in self.own_settings.accepted_rss_categories):
                urls.append(item['link'])
        return urls
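All of these examples lean on two shared helpers, construct_request_dict and request_with_retries, whose definitions are not part of this listing. The sketch below is a hypothetical reconstruction inferred purely from the call sites (a keyword dict forwarded to requests, a post flag, an optional retries count, None on failure); the real project code may differ.

# Hypothetical reconstruction of the shared helpers, inferred from call sites.
import logging
from typing import Any, Optional

import requests

logger = logging.getLogger(__name__)


def construct_request_dict(settings: Any, own_settings: Any) -> dict[str, Any]:
    # Assumed shape: keyword arguments forwarded verbatim to requests.get/post.
    # The attribute names used here are placeholders.
    return {
        'headers': getattr(settings, 'requests_headers', {}),
        'cookies': getattr(own_settings, 'cookies', None),
        'timeout': getattr(own_settings, 'timeout_timer', 25),
    }


def request_with_retries(url: str, request_dict: dict[str, Any],
                         post: bool = False,
                         retries: int = 3) -> Optional[requests.models.Response]:
    # Retries transient network failures; returns None when every attempt
    # fails, which is what the "if not response" checks above rely on.
    for attempt in range(retries):
        try:
            if post:
                return requests.post(url, **request_dict)
            return requests.get(url, **request_dict)
        except (requests.exceptions.Timeout,
                requests.exceptions.ConnectionError) as e:
            logger.warning("Request failed, retry {} of {}: {}".format(
                attempt + 1, retries, e))
    return None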
Example 2
    def compare_by_title(self, title: str) -> bool:

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['params'] = {'q': title}

        response = request_with_retries(
            constants.main_page,
            request_dict,
            post=False,
        )

        if not response:
            return False
        response.encoding = 'utf-8'

        m = re.finditer(r'a href="/view/(\d+)/*"', response.text)

        matches_links = set()

        for match in m:
            matches_links.add("{}{}".format(
                constants.gallery_container_url, match.group(1)))

        self.gallery_links = list(matches_links)

        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
Example 3
    def compare_by_title_search_page(self, title: str) -> bool:

        # https://www.fakku.net/search/+title
        full_url = urljoin(constants.main_url, 'search/+') + quote(title)

        logger.info("Querying URL: {}".format(full_url))

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            full_url,
            request_dict,
            post=False, retries=3
        )

        if not response:
            logger.info("Got no response from server")
            return False

        response.encoding = 'utf-8'
        soup_1 = BeautifulSoup(response.text, 'html.parser')

        matches_links = set()

        for link_container in soup_1.find_all("div", class_=re.compile("content-meta")):
            title_container = link_container.find("a", class_="content-title")
            if title_container:
                matches_links.add(urljoin(constants.main_url, title_container.get('href')))

        self.gallery_links = list(matches_links)
        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
Example 4
    def compare_by_title(self, title: str) -> bool:

        request_dict = construct_request_dict(self.settings, self.own_settings)

        request_dict['params'] = {'type': 'title', 'search_value': title}

        r = request_with_retries(
            urljoin(constants.main_url, 'Search'),
            request_dict,
        )

        if not r:
            logger.info("Got no response from server")
            return False

        r.encoding = 'utf-8'
        soup_1 = BeautifulSoup(r.text, 'html.parser')

        matches_links = set()

        for gallery in soup_1.find_all(
                "div", class_=re.compile("showcase_comics_product_image_box")):
            link_container = gallery.find("a")
            if link_container:
                matches_links.add(
                    urljoin(constants.main_url, link_container['href']))

        self.gallery_links = list(matches_links)
        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
Example 5
    def compare_by_title(self, title: str) -> bool:

        request_dict = construct_request_dict(self.settings, self.own_settings)

        r = request_with_retries(
            urljoin(constants.main_url, 'search/') + quote(title),
            request_dict,
        )

        if not r:
            logger.info("Got no response from server")
            return False

        r.encoding = 'utf-8'
        soup_1 = BeautifulSoup(r.text, 'html.parser')

        matches_links = set()

        # content-row manga row
        for gallery in soup_1.find_all("div", class_=re.compile("content-row")):
            link_container = gallery.find("a", class_="content-title")
            if link_container:
                matches_links.add(urljoin(constants.main_url, link_container['href']))

        self.gallery_links = list(matches_links)
        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
Example 6
    def request_hath_download(self, root: str, gid: str, token: str,
                              key: str) -> Optional[requests.models.Response]:

        url = root + '/archiver.php'

        params = {'gid': gid, 'token': token, 'or': key}

        # logger.info("Requesting hath download to URL: {}".format(url))

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['params'] = params
        request_dict['data'] = {'hathdl_xres': 'org'}

        for retry_count in range(3):
            try:
                r = requests.post(url, **request_dict)
                return r
            except (requests.exceptions.Timeout,
                    requests.exceptions.ConnectionError) as e:
                if retry_count < 2:
                    logger.warning(
                        "Request failed, retrying {} of {}: {}".format(
                            retry_count + 1, 3, str(e)))
                    continue
                else:
                    return None
        return None
Example 7
    def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            link,
            request_dict,
            post=False,
        )

        if not response:
            return None

        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        if soup:
            info_container = soup.find("div", id=re.compile("info"))
            title_jpn_match = info_container.h2 if info_container else None

            gallery_id_match = re.search(
                r'{}(\d+)'.format(constants.gallery_container_url), link)

            if not gallery_id_match:
                return None
            gallery_id = 'nh-' + gallery_id_match.group(1)

            gallery = GalleryData(gallery_id, self.name)
            gallery.title = soup.h1.get_text()
            gallery.title_jpn = title_jpn_match.get_text() if title_jpn_match else ''
            gallery_filecount_match = re.search(r'<div>(\d+) page(s*)</div>',
                                                response.text)
            if gallery_filecount_match:
                gallery.filecount = int(gallery_filecount_match.group(1))
            else:
                gallery.filecount = 0
            gallery.tags = []
            gallery.link = link
            gallery.posted = dateutil.parser.parse(
                soup.find("time")['datetime'])

            for tag_container in soup.find_all("a", {"class": "tag"}):
                tag_name = [text for text in tag_container.stripped_strings][0]
                tag_name = tag_name.split(" | ")[0]
                tag_scope = tag_container.parent.parent.get_text()
                tag_ext = tag_container.parent.get_text()
                tag_scope = tag_scope.replace(tag_ext, "").replace(
                    "\t", "").replace("\n", "").replace(":", "").lower()
                if tag_scope == 'tags':
                    gallery.tags.append(translate_tag(tag_name))
                elif tag_scope == 'categories':
                    gallery.category = tag_name.capitalize()
                else:
                    gallery.tags.append(
                        translate_tag(tag_scope + ":" + tag_name))

        else:
            return None
        return gallery
Example 8
    def get_galleries_from_page_links(self, page_links: Iterable[str], page_links_results: list[DataDict]) -> None:

        api_page_links = []

        for page_link in page_links:

            m = re.search(r'(.+)/s/(\w+)/(\d+)-(\d+)', page_link)
            if not m:
                continue
            api_page_links.append(
                {'data': [m.group(3), m.group(2), m.group(4)]})

        api_page_links_chunks = list(chunks(api_page_links, 25))

        for i, group in enumerate(api_page_links_chunks):

            if i % 3 == 2:
                time.sleep(self.own_settings.wait_timer)

            data = {
                'method': 'gtoken',
                'pagelist': [x['data'] for x in group]}

            headers = {'Content-Type': 'application/json'}

            request_dict = construct_request_dict(self.settings, self.own_settings)
            request_dict['headers'] = {**headers, **self.settings.requests_headers}
            request_dict['data'] = json.dumps(data)

            response = request_with_retries(
                constants.ge_api_url,
                request_dict,
                post=True,
            )

            if not response:
                continue
            try:
                response_data = response.json()
            except (ValueError, KeyError):
                logger.error("Could not parse response to JSON: {}".format(response.text))
                continue

            for gid_token_pair in response_data['tokenlist']:

                discard_approved, discard_message = self.discard_gallery_by_internal_checks(
                    gallery_id=gid_token_pair['gid'],
                    link=link_from_gid_token_fjord(gid_token_pair['gid'], gid_token_pair['token'], False)
                )

                if discard_approved:
                    if not self.settings.silent_processing:
                        logger.info(discard_message)
                    continue

                page_links_results.append(
                    {'data': (gid_token_pair['gid'], gid_token_pair['token']),
                     'link': link_from_gid_token_fjord(gid_token_pair['gid'], gid_token_pair['token'], False)})
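The chunks helper used here (and again in example 13) is not shown; a minimal sketch consistent with how it is called, list(chunks(items, 25)) yielding successive fixed-size groups, would be:

# Minimal sketch of chunks(): yield successive n-sized slices of a sequence;
# the last slice may be shorter.
from typing import Iterator, Sequence, TypeVar

T = TypeVar('T')


def chunks(seq: Sequence[T], n: int) -> Iterator[Sequence[T]]:
    for i in range(0, len(seq), n):
        yield seq[i:i + n]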
Example 9
    def compare_by_title(self, gallery_title: str) -> bool:

        api_url = urljoin(self.own_settings.metadata_url, constants.api_path)
        logger.info("Querying URL: {}".format(api_url))

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['params'] = {'match': True, 'title': gallery_title}

        response = request_with_retries(
            api_url,
            request_dict,
            post=False, retries=3
        )

        if not response:
            logger.info("Got no response from server")
            return False

        try:
            response_data = response.json()
        except (ValueError, KeyError):
            logger.error("Could not parse response to JSON: {}".format(response.text))
            return False

        matches_links = set()

        if 'error' in response_data:
            logger.info("Got error from server: {}".format(response_data['error']))
            return False

        for gallery in response_data:
            if 'link' in gallery:
                matches_links.add(gallery['link'])
                if 'gallery_container' in gallery and gallery['gallery_container']:
                    if self.settings.gallery_model:
                        gallery_container = self.settings.gallery_model.objects.filter(
                            gid=gallery['gallery_container'], provider=gallery['provider']
                        )
                        first_gallery_container = gallery_container.first()
                        if first_gallery_container:
                            gallery['gallery_container_gid'] = first_gallery_container.gid
                if 'magazine' in gallery and gallery['magazine']:
                    if self.settings.gallery_model:
                        magazine = self.settings.gallery_model.objects.filter(
                            gid=gallery['magazine'], provider=gallery['provider']
                        )
                        first_magazine = magazine.first()
                        if first_magazine:
                            gallery['magazine_gid'] = first_magazine.gid
                if 'posted' in gallery:
                    if gallery['posted'] != 0:
                        gallery['posted'] = datetime.fromtimestamp(int(gallery['posted']), timezone.utc)
                    else:
                        gallery['posted'] = None
                self.values_array.append(GalleryData(**gallery))

        self.gallery_links = list(matches_links)
        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
Example 10
    def search_using_xml_api(self, title: str) -> bool:

        if not self.own_settings.api_key:
            logger.error("Can't use {} API without an api key. Check {}/API_MANUAL.txt".format(
                self.name,
                constants.main_page
            ))
            return False

        page = 1
        galleries = []

        while True:
            link = '{}/api/{}/?S=objectSearch&sn={}&page={}'.format(
                constants.main_page,
                self.own_settings.api_key,
                quote(title),
                page
            )

            request_dict = construct_request_dict(self.settings, self.own_settings)

            response = request_with_retries(
                link,
                request_dict,
                post=False,
            )

            if not response:
                break

            response.encoding = 'utf-8'
            # Based on: https://www.doujinshi.org/API_MANUAL.txt

            api_galleries = convert_api_response_text_to_gallery_dicts(response.text)

            if not api_galleries:
                break

            galleries.extend(api_galleries)

            # The API manual says 25 results max per query, but in practice it
            # returns up to 50, so fewer than 50 means there are no more pages.
            if len(api_galleries) < 50:
                break

            page += 1

        self.values_array = galleries

        self.gallery_links = [x.link for x in galleries if x.link]
        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
Example 11
    def compare_by_image(self, zip_path: str, only_cover: bool) -> bool:

        if os.path.splitext(zip_path)[1] != '.zip':
            self.gallery_links = []
            return False

        try:
            my_zip = zipfile.ZipFile(zip_path, 'r')
        except (zipfile.BadZipFile, NotImplementedError):
            self.gallery_links = []
            return False

        filtered_files = get_images_from_zip(my_zip)

        if not filtered_files:
            my_zip.close()
            self.gallery_links = []
            return False

        first_file = filtered_files[0]

        if first_file[1] is None:
            with my_zip.open(first_file[0]) as current_img:
                first_file_sha1 = sha1_from_file_object(current_img)
        else:
            with my_zip.open(first_file[1]) as current_zip:
                with zipfile.ZipFile(current_zip) as my_nested_zip:
                    with my_nested_zip.open(first_file[0]) as current_img:
                        first_file_sha1 = sha1_from_file_object(current_img)

        payload = {'f_shash': first_file_sha1,
                   'fs_from': os.path.basename(first_file[0]),
                   'fs_covers': 1 if only_cover else 0,
                   'fs_similar': 0}

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['params'] = payload

        r = requests.get(
            constants.ex_page,
            **request_dict
        )

        my_zip.close()

        parser = SearchHTMLParser()
        parser.feed(r.text)

        self.gallery_links = list(parser.galleries)

        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
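sha1_from_file_object is another helper whose body is not included here; a plausible sketch (an assumption, matching the f_shash usage above) hashes the file object in blocks:

# Plausible sketch of sha1_from_file_object(): chunked SHA-1 over a binary
# file object, returning the hex digest used as the f_shash search value.
import hashlib
from typing import IO


def sha1_from_file_object(fp: IO[bytes]) -> str:
    digest = hashlib.sha1()
    for block in iter(lambda: fp.read(8192), b''):
        digest.update(block)
    return digest.hexdigest()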
Example 12
    def start_download(self) -> None:

        if not self.gallery or not self.gallery.link:
            return

        logger.info(
            "Downloading an archive from a generic HTTP server: {}".format(
                self.gallery.link))

        request_dict = construct_request_dict(self.settings, self.own_settings)

        request_file = requests.get(self.gallery.link,
                                    stream=True,
                                    **request_dict)

        filename = get_filename_from_cd(
            request_file.headers.get('content-disposition'))

        if not filename:
            if '/' in self.gallery.link:
                filename = self.gallery.link.rsplit('/', 1)[1]

        if not filename:
            logger.error("Could not find a filename for link: {}".format(
                self.gallery.link))
            self.return_code = 0
            return

        self.gallery.title = filename.replace(".zip", "")
        self.gallery.filename = replace_illegal_name(
            available_filename(
                self.settings.MEDIA_ROOT,
                os.path.join(self.own_settings.archive_dl_folder, filename)))

        filepath = os.path.join(self.settings.MEDIA_ROOT,
                                self.gallery.filename)
        with open(filepath, 'wb') as fo:
            for chunk in request_file.iter_content(4096):
                fo.write(chunk)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
            filepath)
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(filepath)

            self.fileDownloaded = 1
            self.return_code = 1

        else:
            logger.error("Could not download archive")
            self.return_code = 0
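get_filename_from_cd extracts a filename from the Content-Disposition header; its real implementation is not shown, but a common minimal version (offered here as an assumption) is:

# Hedged sketch of get_filename_from_cd(): pull filename="..." out of a
# Content-Disposition header value, returning None when it is absent.
import re
from typing import Optional


def get_filename_from_cd(content_disposition: Optional[str]) -> Optional[str]:
    if not content_disposition:
        return None
    matches = re.findall(r'filename="?([^";]+)"?', content_disposition)
    return matches[0] if matches else None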
Example 13
    def get_galleries_from_xml(self,
                               url_group: Iterable[str]) -> list[GalleryData]:

        possible_gallery_ids = [
            self.id_from_url(gallery_url) for gallery_url in url_group
        ]

        galleries_ids = [
            gallery_id.replace('mugi-B', 'B')
            for gallery_id in possible_gallery_ids if gallery_id
        ]

        galleries = list()

        gallery_chunks = list(chunks(galleries_ids, 25))

        for i, group in enumerate(gallery_chunks):
            logger.info(
                "Calling API ({}). Gallery group: {}, galleries in group: {}, total groups: {}"
                .format(self.name, i + 1, len(group), len(gallery_chunks)))

            # API doesn't say anything about needing to wait between requests, but we wait just in case.
            if i > 0:
                time.sleep(self.own_settings.wait_timer)

            link = constants.main_page + '/api/' + self.own_settings.api_key + '/?S=getID&ID=' + ",".join(
                group)

            request_dict = construct_request_dict(self.settings,
                                                  self.own_settings)

            response = request_with_retries(
                link,
                request_dict,
                post=False,
            )

            if not response:
                continue

            response.encoding = 'utf-8'
            api_galleries = convert_api_response_text_to_gallery_dicts(
                response.text)

            if not api_galleries:
                continue
            galleries.extend(api_galleries)

        return galleries
Example 14
    def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            link,
            request_dict,
            post=False,
        )

        if not response:
            return None

        response.encoding = 'utf-8'

        return self.process_regular_gallery_page(link, response.text)
Example 15
    def start_download(self) -> None:

        if not self.gallery or not self.gallery.temp_archive:
            return

        logger.info(
            "Downloading an archive: {} from a Panda Backup-like source: {}".
            format(self.gallery.title, self.gallery.temp_archive['link']))

        to_use_filename = get_base_filename_string_from_gallery_data(
            self.gallery)

        to_use_filename = replace_illegal_name(to_use_filename)

        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(self.own_settings.archive_dl_folder,
                         to_use_filename + '.zip'))  # TODO: File could be cbz.

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['stream'] = True
        request_file = request_with_retries(
            self.gallery.temp_archive['link'],
            request_dict,
        )
        if not request_file:
            logger.error("Could not download archive")
            self.return_code = 0
            return
        filepath = os.path.join(self.settings.MEDIA_ROOT,
                                self.gallery.filename)

        with open(filepath, 'wb') as fo:
            for chunk in request_file.iter_content(4096):
                fo.write(chunk)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
            filepath)
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(filepath)

            self.fileDownloaded = 1
            self.return_code = 1

        else:
            logger.error("Could not download archive")
            self.return_code = 0
Example 16
    def request_torrent_download(
            self, root: str, gid: str,
            token: str) -> Optional[requests.models.Response]:

        url = root + '/gallerytorrents.php'

        params = {'gid': gid, 't': token}

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['params'] = params

        response = request_with_retries(
            url,
            request_dict,
            post=True,
        )

        return response
Example 17
    def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:

        link_root, gid, token = root_gid_token_from_link(link)

        if link_root is None or gid is None or token is None:
            return None

        if self.own_settings.use_ex_for_fjord and self.own_settings.cookies and link_root == constants.ex_api_url:
            api_page = constants.ex_api_url
        else:
            api_page = constants.ge_api_url

        data = utilities.request_data_from_gid_token_iterable([(gid, token)])

        headers = {'Content-Type': 'application/json'}

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['headers'] = {**headers, **self.settings.requests_headers}
        request_dict['data'] = json.dumps(data)

        response = request_with_retries(
            api_page,
            request_dict,
            post=True,
        )

        if not response:
            return None
        try:
            response_data = response.json()
        except (ValueError, KeyError):
            logger.error("Could not parse response to JSON: {}".format(response.text))
            return None
        for gallery_data in response_data['gmetadata']:
            if 'error' in gallery_data:
                logger.error(
                    "Adding gallery {}: "
                    "failed with error: {}".format(gallery_data['gid'], gallery_data['error'])
                )
                return None
            internal_gallery_data = map_external_gallery_data_to_internal(gallery_data)
            return internal_gallery_data
        return None
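The payload built by utilities.request_data_from_gid_token_iterable is not shown. The public gdata JSON API expects {"method": "gdata", "gidlist": [[gid, token], ...], "namespace": 1}, so a sketch consistent with that format (the helper's actual body is an assumption) would be:

# Sketch of the request body the helper presumably builds for the gdata API:
# gid as an integer, token as a string, batched into a single gidlist.
from typing import Any, Iterable, Tuple


def request_data_from_gid_token_iterable(
        gid_token_pairs: Iterable[Tuple[str, str]]) -> dict[str, Any]:
    return {
        'method': 'gdata',
        'gidlist': [[int(gid), token] for gid, token in gid_token_pairs],
        'namespace': 1,
    }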
Example 18
    def request_archive_download(
            self, root: str, gid: str, token: str,
            key: str) -> Optional[requests.models.Response]:

        url = root + '/archiver.php'

        params = {'gid': gid, 'token': token, 'or': key}

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['params'] = params
        request_dict['data'] = constants.archive_download_data

        response = request_with_retries(
            url,
            request_dict,
            post=True,
        )

        return response
Example 19
    def compare_by_title_json(self, title: str) -> bool:

        # https://www.fakku.net/suggest/return%20of%20me
        headers = {
            'Content-Type': 'application/json',
            'Referer': constants.main_url + '/',
            'X-Requested-With': 'XMLHttpRequest',
        }

        logger.info("Querying URL: {}".format(urljoin(constants.main_url, 'suggest/') + quote(title.lower())))

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['headers'] = {**headers, **self.settings.requests_headers}

        response = request_with_retries(
            urljoin(constants.main_url, 'suggest/') + quote(title.lower()),
            request_dict,
            post=False, retries=3
        )

        if not response:
            logger.info("Got no response from server")
            return False

        response_data = response.json()

        matches_links = set()

        if 'error' in response_data:
            logger.info("Got error from server: {}".format(response_data['error']))
            return False

        for gallery in response_data:
            if gallery['type'] in ('doujinshi', 'manga', 'hentai', 'magazine'):
                matches_links.add(urljoin(constants.main_url, gallery['link']))

        self.gallery_links = list(matches_links)
        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
Example 20
    def compare_by_title_google(self, title: str) -> bool:

        payload = {'q': 'site:e-hentai.org ' + title}

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['params'] = payload

        r = requests.get(
            "https://www.google.com/search",
            **request_dict
        )

        matches_links = set()

        m = re.finditer(r'(ex|g\.e-|e-)hentai\.org/g/(\d+)/(\w+)', r.text)

        for match in m:
            matches_links.add(
                self.get_final_link_from_link(
                    link_from_gid_token_fjord(match.group(2), match.group(3), False)
                )
            )

        m2 = re.finditer(
            r'(ex|g\.e-|e-)hentai\.org/gallerytorrents\.php\?gid=(\d+)&t=(\w+)/', r.text)

        for match in m2:
            matches_links.add(
                self.get_final_link_from_link(
                    link_from_gid_token_fjord(match.group(2), match.group(3), False)
                )
            )

        self.gallery_links = list(matches_links)
        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
Example 21
    def compare_by_title(self, title: str) -> bool:

        request_dict = construct_request_dict(self.settings, self.own_settings)

        request_dict['params'] = {'q': title}

        r = request_with_retries(
            urljoin(constants.main_url, 'search/'),
            request_dict,
        )

        if not r:
            logger.info("Got no response from server")
            return False

        r.encoding = 'utf-8'
        soup_1 = BeautifulSoup(r.text, 'html.parser')

        matches_links = set()

        discard_titles = ["(SPANISH)", "(FRENCH)"]

        for link_container in soup_1.find_all(
                "a", class_=re.compile("product-card")):
            # Discard spanish/french releases
            title_container = link_container.find("div",
                                                  class_="product-card__name")
            if any(x for x in discard_titles
                   if title_container.get_text().startswith(x)):
                continue
            matches_links.add(
                urljoin(
                    constants.main_url,
                    urljoin(link_container['href'],
                            urlparse(link_container['href']).path)))

        self.gallery_links = list(matches_links)
        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
Example 22
    def crawl_feed(self, feed_url: str = '') -> list[ChaikaGalleryData]:

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            feed_url,
            request_dict,
            post=False,
        )

        dict_list = []

        if not response:
            return []

        try:
            json_decoded = response.json()
        except (ValueError, KeyError):
            logger.error("Could not parse response to JSON: {}".format(
                response.text))
            return []

        if isinstance(json_decoded, dict):
            if 'galleries' in json_decoded:
                dict_list = json_decoded['galleries']
            else:
                dict_list.append(json_decoded)
        elif isinstance(json_decoded, list):
            dict_list = json_decoded

        total_galleries_filtered: list[ChaikaGalleryData] = []

        for gallery in dict_list:
            if 'result' in gallery:
                continue
            gallery['posted'] = datetime.fromtimestamp(int(gallery['posted']),
                                                       timezone.utc)
            gallery_data = ChaikaGalleryData(**gallery)
            total_galleries_filtered.append(gallery_data)

        return total_galleries_filtered
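For reference, crawl_feed accepts either a wrapper object with a 'galleries' key, a bare gallery object, or a bare list, and skips entries carrying a 'result' key. Illustrative shapes only; apart from 'posted' (a Unix timestamp) and 'result', any other field names are placeholders for whatever ChaikaGalleryData expects:

# Illustrative only: the payload shapes the branches above handle.
wrapped = {'galleries': [{'posted': 1577836800}]}  # dict with a 'galleries' list
single = {'posted': 1577836800}                    # bare dict: treated as one gallery
bare = [{'posted': 1577836800}]                    # bare list of galleries
# Entries containing a 'result' key are skipped during filtering.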
Example 23
    def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            link,
            request_dict,
            post=False,
        )

        if not response:
            return None

        response.encoding = 'utf-8'
        new_text = re.sub(r'(<div class="right">\d+?)</b>', r'\1',
                          response.text)

        if constants.main_url + '/magazines/' in link:
            return self.process_magazine_page(link, new_text)
        else:
            return self.process_regular_gallery_page(link, new_text)
Example 24
    def get_final_link_from_link(self, link: str) -> str:

        time.sleep(self.own_settings.wait_timer)
        gallery_gid, gallery_token = get_gid_token_from_link(link)
        gallery_link = link_from_gid_token_fjord(gallery_gid, gallery_token, True)

        request_dict = construct_request_dict(self.settings, self.own_settings)

        gallery_page_text = requests.get(
            gallery_link,
            **request_dict
        ).text

        if 'Gallery Not Available' in gallery_page_text:
            return gallery_link
        else:
            gallery_parser = GalleryHTMLParser()
            gallery_parser.feed(gallery_page_text)
            if gallery_parser.found_non_final_gallery == 2 and gallery_parser.non_final_gallery:
                return self.get_final_link_from_link(gallery_parser.non_final_gallery)
        return gallery_link
Example 25
    def start_download(self) -> None:

        if not self.gallery or not self.gallery.link or not self.gallery.archiver_key:
            return

        to_use_filename = get_base_filename_string_from_gallery_data(
            self.gallery)

        to_use_filename = replace_illegal_name(to_use_filename)

        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(self.own_settings.archive_dl_folder,
                         to_use_filename + '.zip'))

        request_dict = construct_request_dict(self.settings, self.own_settings)

        request_file = requests.get(self.gallery.archiver_key,
                                    stream=True,
                                    **request_dict)

        filepath = os.path.join(self.settings.MEDIA_ROOT,
                                self.gallery.filename)
        with open(filepath, 'wb') as fo:
            for chunk in request_file.iter_content(4096):
                fo.write(chunk)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
            filepath)
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(filepath)

            self.fileDownloaded = 1
            self.return_code = 1

        else:
            logger.error("Could not download archive")
            os.remove(filepath)
            self.return_code = 0
Example 26
    def compare_by_title(self, image_title: str) -> bool:

        filters = {'f_search': '"' + image_title + '"'}

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['params'] = filters

        r = requests.get(
            constants.ex_page,
            **request_dict
        )

        parser = SearchHTMLParser()
        parser.feed(r.text)

        self.gallery_links = list(parser.galleries)

        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
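SearchHTMLParser is not included in this listing. A hypothetical sketch of such a parser, an html.parser.HTMLParser subclass that collects gallery links from the results page into a galleries set (which is all the call sites above require), could look like:

# Hypothetical sketch of SearchHTMLParser: collect /g/<gid>/<token> hrefs
# from a search results page; only the .galleries set is relied on above.
import re
from html.parser import HTMLParser


class SearchHTMLParser(HTMLParser):

    def __init__(self) -> None:
        super().__init__()
        self.galleries: set[str] = set()

    def handle_starttag(self, tag: str, attrs: list) -> None:
        if tag != 'a':
            return
        for name, value in attrs:
            if name == 'href' and value and re.search(r'/g/\d+/\w+', value):
                self.galleries.add(value)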
Example 27
    def parse_posted_date_from_feed(self, link: str,
                                    gid: str) -> Optional[datetime]:
        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            link,
            request_dict,
            post=False,
        )

        if not response:
            return None

        response.encoding = 'utf-8'

        feed = feedparser.parse(response.text)

        for item in feed['items']:
            if gid in item['id']:
                return date_parser.parse(
                    item['published'],
                    tzinfos=constants.extra_feed_url_timezone)
        return None
Example 28
    def compare_by_title(self, title: str) -> bool:

        headers = {'Content-Type': 'application/json'}

        api_link = constants.posts_api_url
        payload = {'search': title}

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['headers'] = {**headers, **self.settings.requests_headers}
        request_dict['params'] = payload

        response = request_with_retries(
            api_link,
            request_dict,
            post=False,
        )

        if not response:
            return False
        response.encoding = 'utf-8'
        try:
            response_data = response.json()
        except (ValueError, KeyError):
            return False

        matches_links = set()

        for gallery in response_data:
            matches_links.add(gallery['link'])

        self.gallery_links = list(matches_links)
        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
Example 29
    def compare_by_title(self, title: str) -> bool:

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['params'] = {'q': title}

        r = requests.get("{}/search/".format(constants.main_page),
                         **request_dict)

        m = re.finditer(r'a href="/g/(\d+)/"', r.text)

        matches_links = set()

        for match in m:
            matches_links.add("{}{}".format(
                constants.gallery_container_url, match.group(1)))

        self.gallery_links = list(matches_links)

        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
Example 30
    def start_download(self) -> None:

        if not self.gallery:
            return

        to_use_filename = get_base_filename_string_from_gallery_data(
            self.gallery)

        to_use_filename = replace_illegal_name(to_use_filename)

        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(self.own_settings.archive_dl_folder,
                         to_use_filename + '.zip'))

        if not (self.gallery.root and self.gallery.gid and self.gallery.token
                and self.gallery.archiver_key):
            logger.error(
                'Missing required data -> root: {}, gid: {}, token: {}, archiver_key: {}.'
                .format(
                    self.gallery.root,
                    self.gallery.gid,
                    self.gallery.token,
                    self.gallery.archiver_key,
                ))
            self.return_code = 0
            return

        r = self.request_archive_download(self.gallery.root, self.gallery.gid,
                                          self.gallery.token,
                                          self.gallery.archiver_key)

        if not r:
            logger.error('Could not get download link.')
            self.return_code = 0
            return

        r.encoding = 'utf-8'

        if 'Invalid archiver key' in r.text:
            logger.error("Invalid archiver key received.")
            self.return_code = 0
        else:

            archive_link = get_archive_link_from_html_page(r.text)

            if archive_link == '':
                logger.error(
                    'Could not find archive link, page text: {}'.format(
                        r.text))
                self.return_code = 0
            else:
                m = re.match(r"(.*?)(\?.*?)", archive_link)
                if m:
                    archive_link = m.group(1)

                logger.info('Got link: {}, from url: {}'.format(
                    archive_link, r.url))

                request_dict = construct_request_dict(self.settings,
                                                      self.own_settings)

                request_file = requests.get(archive_link + '?start=1',
                                            stream=True,
                                            **request_dict)

                if request_file and request_file.status_code == 200:
                    logger.info(
                        'Downloading gallery: {}.zip'.format(to_use_filename))
                    filepath = os.path.join(self.settings.MEDIA_ROOT,
                                            self.gallery.filename)
                    with open(filepath, 'wb') as fo:
                        for chunk in request_file.iter_content(4096):
                            fo.write(chunk)

                    self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
                        filepath)
                    if self.gallery.filesize > 0:
                        self.crc32 = calc_crc32(filepath)

                        self.fileDownloaded = 1
                        self.return_code = 1

                else:
                    logger.error("Could not download archive")
                    self.return_code = 0