Example #1
0
class SciHubAPI(QObject, threading.Thread):
    """Threaded Sci-Hub client.

    Resolves a query (URL / PMID / DOI / direct PDF link) to a PDF via a
    list of configured Sci-Hub mirrors and saves the result to disk.
    """

    def __init__(self, query, callback=None, conf=None, log=None):
        QObject.__init__(self)
        threading.Thread.__init__(self)

        self._query = query
        self._callback = callback

        # Fall back to a default configuration when none is supplied.
        if conf:
            self._conf = conf
        else:
            self._conf = SciHubConf()

        # Allow the caller to inject a logger; otherwise the built-in
        # ``log`` method below is used.
        if log:
            self.log = log

        self._sess = requests.Session()
        self._sess.headers = json.loads(
            self._conf.get('network', 'session_header'))

        # Retry transient network failures at the transport level.
        retry_times = self._conf.getint('network', 'retry_times')
        retry = Retry(total=retry_times, read=retry_times, connect=retry_times)
        adapter = HTTPAdapter(max_retries=retry)
        self._sess.mount('http://', adapter)
        self._sess.mount('https://', adapter)

        self._set_http_proxy()

        self._doi_pattern = r'\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'])\S)+)\b'
        self._illegal_filename_pattern = r'[\/\\\:\*\?\"\<\>\|]'

    def log(self, message, type=None):
        """Default logger: print ``message``, prefixed with ``type`` when given."""
        if type:
            log_formatter = '[{type}] - {message}'
        else:
            log_formatter = '{message}'

        print(log_formatter.format(type=type, message=message))

    def _set_http_proxy(self):
        """Configure session proxies from the ``proxy`` config section.

        Builds ``type://[username[:password]@]host[:port]`` and applies it
        to both HTTP and HTTPS traffic.
        """
        if self._conf.getboolean('proxy', 'enabled'):
            proxy_type = self._conf.get('proxy', 'type')
            proxy_host = self._conf.get('proxy', 'host')
            proxy_port = self._conf.get('proxy', 'port')
            proxy_username = self._conf.get('proxy', 'username')
            proxy_password = self._conf.get('proxy', 'password')

            proxy = proxy_type + '://'

            if proxy_username:
                proxy += proxy_username

                # BUG FIX: the password must be joined to the username with
                # ':' (``user:pass@host``); it was previously appended with
                # no separator, yielding a malformed proxy URL.
                if proxy_password:
                    proxy += ':' + proxy_password

                proxy += '@'

            proxy += proxy_host

            if proxy_port:
                proxy += ':' + proxy_port

            self._sess.proxies = {'http': proxy, 'https': proxy}

    def get_pdf_metadata(self, pdf_file_stream):
        """Extract author/title/year metadata from an open PDF stream.

        Args:
            pdf_file_stream: Seekable binary file object containing the PDF.

        Returns:
            dict with 'author', 'title' and 'year'; entries that cannot be
            read keep their ``UNKNOWN_*`` placeholders.
        """
        metadata = {
            'author': 'UNKNOWN_AUTHOR',
            'title': 'UNKNOWN_TITLE',
            'year': 'UNKNOWN_YEAR'
        }

        pdf_parser = PDFParser(pdf_file_stream)
        pdf_doc = PDFDocument(pdf_parser)
        # BUG FIX: use ``.get`` so PDFs missing any of these info keys do
        # not raise ``KeyError`` (previously indexed directly).
        pdf_info = pdf_doc.info[0] if pdf_doc.info else {}

        author = make_pdf_metadata_str(pdf_info.get('Author', ''))
        if author:
            metadata['author'] = author

        title = make_pdf_metadata_str(pdf_info.get('Title', ''))
        if title:
            metadata['title'] = title

        year = pdf_metadata_moddate_to_year(
            make_pdf_metadata_str(pdf_info.get('ModDate', '')))
        if year:
            metadata['year'] = year

        return metadata

    def guess_query_type(self, query):
        """Classify ``query`` as 'pdf', 'url', 'pmid', 'doi' or 'unknown'."""
        if query.startswith('http') or query.startswith('https'):
            if query.endswith('pdf'):
                query_type = 'pdf'
            else:
                query_type = 'url'
        elif query.isdigit():
            query_type = 'pmid'
        elif query.startswith('doi:') or re.match(self._doi_pattern, query):
            query_type = 'doi'
        else:
            query_type = 'unknown'

        log_formatter = self.tr('Query type: ') + '{query_type}'
        self.log(log_formatter.format(query_type=query_type.upper()), 'INFO')

        return query_type

    def fetch(self, query):
        """Try each configured Sci-Hub mirror until the PDF is downloaded.

        Args:
            query: User query (URL / PMID / DOI / PDF link).

        Returns:
            data: dict with 'pdf', 'filename' and metadata keys on success,
                or an 'error' key describing the failure.
        """
        query_type = self.guess_query_type(query)
        data = {}

        if query_type == 'unknown':
            data['error'] = self.tr('Unknown query type')
            return data

        # Rotate the mirror list so the currently configured URL is tried
        # first, followed by the remaining mirrors in order.
        current_scihub_url = self._conf.get('network', 'scihub_url')
        scihub_available_urls = json.loads(
            self._conf.get('network', 'scihub_available_urls'))
        current_scihub_url_index = scihub_available_urls.index(
            current_scihub_url)

        scihub_available_urls_ = scihub_available_urls[
            current_scihub_url_index:]
        scihub_available_urls_.extend(
            scihub_available_urls[:current_scihub_url_index])

        for attempt, scihub_url in enumerate(scihub_available_urls_):
            data = {}

            log_formatter = self.tr('Using Sci-Hub URL: ') + '{scihub_url}'
            self.log(log_formatter.format(scihub_url=scihub_url), 'INFO')
            self._conf.set('network', 'scihub_url', scihub_url)

            pdf_url = query

            if query_type != 'pdf':
                pdf_query_url = 'http://{scihub_url}/{query}'.format(
                    scihub_url=scihub_url, query=query)

                try:
                    self.log(self.tr('Fetching PDF URL ...'), 'INFO')

                    pdf_url_response = self._sess.get(
                        pdf_query_url,
                        verify=False,
                        timeout=self._conf.getfloat('network', 'timeout') /
                        1000.0)

                    # The PDF is served inside the first <iframe> of the
                    # mirror's response page.
                    html = etree.HTML(pdf_url_response.content)
                    iframes = html.xpath('//iframe')

                    if len(iframes) > 0:
                        pdf_url = iframes[0].attrib['src']

                        log_formatter = self.tr('Got PDF URL: ') + '{pdf_url}'
                        self.log(log_formatter.format(pdf_url=pdf_url), 'INFO')
                    else:
                        data['error'] = self.tr('No valide iframe!')
                        self.log(self.tr('Failed to get PDF URL!'), 'ERROR')
                        self.log(data['error'], 'ERROR')
                except Exception as err:
                    data['error'] = str(err)
                    self.log(self.tr('Failed to get PDF!'), 'ERROR')
                    self.log(data['error'], 'ERROR')

            if 'error' not in data:
                # Derive a filesystem-safe filename from the URL path.
                filename = urlparse(pdf_url).path[1:].split('/')[-1]
                data['filename'] = re.sub(self._illegal_filename_pattern, '_',
                                          filename)

                self.log(self.tr('Fetching PDF ...'), 'INFO')

                try:
                    pdf_response = self._sess.get(
                        pdf_url,
                        verify=False,
                        timeout=self._conf.getfloat('network', 'timeout') /
                        1000.0)

                    if pdf_response.headers[
                            'Content-Type'] == 'application/pdf':
                        data['pdf'] = pdf_response.content

                        # BUG FIX: use a context manager so the temporary
                        # file is closed even if metadata extraction raises.
                        with tempfile.TemporaryFile() as temp_pdf_file:
                            temp_pdf_file.write(data['pdf'])
                            pdf_metadata = self.get_pdf_metadata(
                                temp_pdf_file)

                        data = dict(data, **pdf_metadata)
                    else:
                        data['error'] = self.tr('Unknown Content-Type')
                        self.log(self.tr('Failed to get PDF!'), 'ERROR')
                        self.log(data['error'], 'ERROR')
                except Exception as err:
                    data['error'] = str(err)
                    self.log(self.tr('Failed to get PDF!'), 'ERROR')
                    self.log(data['error'], 'ERROR')

            if 'error' not in data:
                break
            elif attempt == len(scihub_available_urls_) - 1:
                self.log(self.tr('Failed with all Sci-Hub URLs!'), 'ERROR')
            else:
                self.log(self.tr('Changing Sci-Hub URL ...'), 'INFO')

        return data

    def rampage(self, query):
        """Fetch the PDF for ``query`` and save it to the configured
        directory. Failures are logged and nothing is saved.
        """
        self.log('\n')
        log_formatter = self.tr('Dealing with query: ') + '{query}'
        self.log(log_formatter.format(query=query), 'INFO')

        data = self.fetch(query)

        if 'error' not in data:
            # BUG FIX: append the actual filename captured in ``fetch``
            # instead of the literal '_(unknown)', which discarded the
            # real name and the '.pdf' extension.
            pdf_name_formatter = self._conf.get(
                'common', 'filename_prefix_format') + '_{filename}'
            pdf_name = pdf_name_formatter.format(**data)
            pdf_path = os.path.join(self._conf.get('common', 'save_to_dir'),
                                    pdf_name)

            with open(pdf_path, 'wb') as fp:
                fp.write(data['pdf'])

            log_formatter = self.tr('Saved PDF as: ') + '{pdf_name}'
            self.log(log_formatter.format(pdf_name=pdf_name), 'INFO')

    def run(self):
        """Thread entry point: process the query, then invoke the callback."""
        self.rampage(self._query)

        if self._callback:
            self._callback()
Example #2
0
class SciHubAPI(QObject, threading.Thread):
    """Threaded Sci-Hub client.

    Fetches the PDF for a query via the configured Sci-Hub URL — solving
    a captcha challenge when required — and saves it to disk.
    """

    def __init__(self,
                 query,
                 log,
                 callback=None,
                 rampage_type=None,
                 conf=None,
                 **kwargs):
        QObject.__init__(self)
        threading.Thread.__init__(self)

        self._query = query
        self.log = log
        self._callback = callback
        self._rampage_type = rampage_type

        # Captcha answer, used only when rampage_type == SciHubRampageType.PDF_CAPTCHA_RESPONSE
        if 'captcha_answer' in kwargs:
            self._captcha_answer = kwargs['captcha_answer']

        # Fall back to the default configuration file when none is given.
        if conf:
            self._conf = conf
        else:
            self._conf = SciHubConf('SciHubEVA.conf')

        self._sess = requests.Session()
        self._sess.headers = json.loads(
            self._conf.get('network', 'session_header'))

        # Retry transient network failures at the transport level.
        retry_times = self._conf.getint('network', 'retry_times')
        retry = Retry(total=retry_times, read=retry_times, connect=retry_times)
        adapter = HTTPAdapter(max_retries=retry)
        self._sess.mount('http://', adapter)
        self._sess.mount('https://', adapter)

        self._set_http_proxy()

        self._doi_pattern = r'\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'])\S)+)\b'
        self._illegal_filename_pattern = r'[\/\\\:\*\?\"\<\>\|]'

    def _set_http_proxy(self):
        """Configure session proxies from the ``proxy`` config section.

        Builds ``type://[username[:password]@]host[:port]`` and applies it
        to both HTTP and HTTPS traffic.
        """
        if self._conf.getboolean('proxy', 'enabled'):
            proxy_type = self._conf.get('proxy', 'type')
            proxy_host = self._conf.get('proxy', 'host')
            proxy_port = self._conf.get('proxy', 'port')
            proxy_username = self._conf.get('proxy', 'username')
            proxy_password = self._conf.get('proxy', 'password')

            proxy = proxy_type + '://'

            if proxy_username:
                proxy += proxy_username

                # BUG FIX: the password must be joined to the username with
                # ':' (``user:pass@host``); it was previously appended with
                # no separator, yielding a malformed proxy URL.
                if proxy_password:
                    proxy += ':' + proxy_password

                proxy += '@'

            proxy += proxy_host

            if proxy_port:
                proxy += ':' + proxy_port

            self._sess.proxies = {'http': proxy, 'https': proxy}

    @staticmethod
    def get_pdf_metadata(pdf):
        """Get PDF metadata with PDF content

        Args:
            pdf: PDF content (in bytes)

        Returns:
            metadata: PDF metadata dictionary; entries that cannot be read
                keep their ``UNKNOWN_*`` placeholders

        """

        metadata = {
            'author': 'UNKNOWN_AUTHOR',
            'title': 'UNKNOWN_TITLE',
            'year': 'UNKNOWN_YEAR'
        }

        # PDFParser needs a real file object, so spill the bytes to a
        # temporary file; the context manager guarantees cleanup even if
        # parsing raises (previously the file could leak on error).
        with tempfile.TemporaryFile() as temp_pdf_file:
            temp_pdf_file.write(pdf)

            pdf_parser = PDFParser(temp_pdf_file)

            try:
                pdf_doc = PDFDocument(pdf_parser)
                pdf_metadata = pdf_doc.info[0]

                author = make_pdf_metadata_str(pdf_metadata.get('Author', ''))
                if author:
                    metadata['author'] = author

                title = make_pdf_metadata_str(pdf_metadata.get('Title', ''))
                if title:
                    metadata['title'] = title

                year = pdf_metadata_moddate_to_year(
                    make_pdf_metadata_str(pdf_metadata.get('ModDate', '')))
                if year:
                    metadata['year'] = year
            except Exception:
                # Best effort: any parsing failure leaves the UNKNOWN_*
                # placeholders in place.
                pass

        return metadata

    def guess_query_type(self, query):
        """Guess query type

        Args:
            query: Query

        Returns:
            query_type: One of 'pdf', 'url', 'pmid', 'doi' or 'string'

        """

        if query.startswith('http') or query.startswith('https'):
            if query.endswith('pdf'):
                query_type = 'pdf'
            else:
                query_type = 'url'
        elif query.isdigit():
            query_type = 'pmid'
        elif query.startswith('doi:') or re.match(self._doi_pattern, query):
            query_type = 'doi'
        else:
            query_type = 'string'

        self.log(self.tr('Query type: ') + query_type.upper(), 'INFO')

        return query_type

    def get_captcha_info(self, pdf_captcha_response):
        """Get captcha information with PDF captcha response

        Args:
            pdf_captcha_response: PDF captcha response

        Returns:
            captcha_id: Captcha ID (None when the page has no captcha form)
            captcha_img_url: Captcha image URL (None when absent)

        """

        captcha_id, captcha_img_url = None, None

        html = etree.HTML(pdf_captcha_response.content)
        imgs = html.xpath('//img[@id="captcha"]')
        ids = html.xpath('//input[@name="id"]')

        if len(imgs) > 0 and len(ids) > 0:
            captcha_id = ids[0].attrib['value']
            captcha_img_src = imgs[0].attrib['src']

            if captcha_img_src.startswith('http'):
                captcha_img_url = captcha_img_src
            else:
                # Relative image path: resolve against the response URL.
                scheme, netloc, *_ = urlparse(pdf_captcha_response.url,
                                              scheme='http')
                captcha_img_url = scheme + '://' + netloc + captcha_img_src

        return captcha_id, captcha_img_url

    def download_captcha_img(self, captcha_img_url):
        """ Download captcha image

        Args:
            captcha_img_url: Captcha image URL

        Returns:
            Captcha image file (an open NamedTemporaryFile; the caller is
            responsible for closing it)

        """

        captcha_img_file = NamedTemporaryFile()

        captcha_img_res = self._sess.get(captcha_img_url, stream=True)

        if captcha_img_res.status_code == 200:
            for chunk in captcha_img_res:
                captcha_img_file.write(chunk)

        captcha_img_file.flush()

        return captcha_img_file

    def fetch_pdf_with_captcha(self, pdf_captcha_response):
        """Fetch PDF with captcha

        Args:
            pdf_captcha_response: PDF captcha response

        Returns:
            pdf: PDF content (in bytes), or None on failure
            err: Error (SciHubError.WRONG_CAPTCHA) or None

        """

        pdf, err = None, None

        captcha_id, _ = self.get_captcha_info(pdf_captcha_response)

        # Re-submit the blocked request together with the captcha answer.
        pdf_response = self._sess.post(
            pdf_captcha_response.url,
            data={
                'answer': self._captcha_answer,
                'id': captcha_id
            },
            verify=False,
            timeout=self._conf.getfloat('network', 'timeout') / 1000.0)

        if pdf_response.headers['Content-Type'] == 'application/pdf':
            self.log(self.tr('Angel [CAPTCHA] down!'), 'INFO')
            pdf = pdf_response.content
        else:
            err = SciHubError.WRONG_CAPTCHA

        return pdf, err

    def fetch_pdf(self, pdf_url):
        """ Fetch PDF with PDF URL

        Args:
            pdf_url: PDF URL

        Returns:
            pdf: PDF (in bytes) or PDF captcha response (when downloading is blocked by captcha)
            err: Error

        """

        self.log(self.tr('Fetching PDF ...'), 'INFO')

        pdf, err = None, None

        pdf_response = self._sess.get(
            pdf_url,
            verify=False,
            timeout=self._conf.getfloat('network', 'timeout') / 1000.0)

        if pdf_response.headers['Content-Type'] == 'application/pdf':
            pdf = pdf_response.content
        elif pdf_response.headers['Content-Type'].startswith('text/html'):
            self.log(self.tr('Angel [CAPTCHA] is coming!'), 'WARN')
            err = SciHubError.BLOCKED_BY_CAPTCHA
            pdf = pdf_response
        else:
            # BUG FIX: report an error instead of returning (None, None),
            # which callers treated as success and then crashed in save_pdf.
            self.log(self.tr('Unknown PDF Content-Type!'), 'ERROR')
            err = SciHubError.UNKNOWN

        return pdf, err

    def fetch_pdf_url(self, query):
        """Fetch PDF URL with query

        Args:
            query: Query

        Returns:
            pdf_url: PDF URL
            err: Error

        """

        scihub_url = self._conf.get('network', 'scihub_url')
        self.log(
            self.tr('Using Sci-Hub URL: ') +
            '<a href="{scihub_url}">{scihub_url}</a>'.format(
                scihub_url=scihub_url), 'INFO')

        query_type = self.guess_query_type(query)
        pdf_url = query
        err = None

        if query_type != 'pdf':
            try:
                self.log(self.tr('Fetching PDF URL ...'), 'INFO')

                pdf_url_response = self._sess.post(
                    scihub_url,
                    data={'request': query},
                    verify=False,
                    timeout=self._conf.getfloat('network', 'timeout') / 1000.0)

                # The PDF is embedded in an <iframe id="pdf"> on the
                # response page.
                html = etree.HTML(pdf_url_response.content)
                iframes = html.xpath(
                    '//iframe[@id="pdf"]') if html is not None else None

                if iframes and len(iframes) > 0:
                    pdf_url = urlparse(iframes[0].attrib['src'],
                                       scheme='http').geturl()
                    pdf_url_html = '<a href="{pdf_url}">{pdf_url}</a>'.format(
                        pdf_url=pdf_url)

                    self.log(self.tr('Got PDF URL: ') + pdf_url_html, 'INFO')
                else:
                    err = SciHubError.NO_VALID_IFRAME
                    request_url = '{scihub_url}/{query}'.format(
                        scihub_url=scihub_url, query=query)
                    request_url_html = '<a href="{request_url}">{request_url}</a>'.format(
                        request_url=request_url)
                    response_url = pdf_url_response.url
                    response_url_html = '<a href="{response_url}">{response_url}</a>'.format(
                        response_url=response_url)

                    self.log(self.tr('Failed to get PDF URL!'), 'ERROR')
                    self.log(self.tr('No valid &lt;iframe&gt;!'), 'ERROR')
                    self.log(self.tr('You may need handle it manually.'),
                             'INFO')
                    self.log(
                        self.tr('Request URL: ') + request_url_html, 'INFO')
                    self.log(
                        self.tr('Response URL: ') + response_url_html, 'INFO')
            except Exception as e:
                err = SciHubError.UNKNOWN

                self.log(self.tr('Failed to get PDF URL!'), 'ERROR')
                self.log(str(e), 'ERROR')

        return pdf_url, err

    def save_pdf(self, pdf, filename):
        """Save pdf to local

        Args:
            pdf: PDF content (in bytes)
            filename: PDF filename

        """

        pdf_name_formatter = self._conf.get(
            'common', 'filename_prefix_format') + '_' + filename
        pdf_metadata = self.get_pdf_metadata(pdf)
        pdf_name = pdf_name_formatter.format(**pdf_metadata)
        pdf_path = os.path.join(self._conf.get('common', 'save_to_dir'),
                                pdf_name)

        with open(pdf_path, 'wb') as fp:
            fp.write(pdf)

        pdf_link = '<a href="file:///{pdf_path}">{pdf_path}</a>'.format(
            pdf_path=pdf_path)

        self.log(self.tr('Saved PDF as: ') + pdf_link, 'INFO')

    def rampage(self, query, rampage_type):
        """Main process of downloading PDF

        Args:
            query: Query (input, response of fetching PDF, ...)
            rampage_type: Rampage type

        Returns:
            res: Result of rampage, maybe used for next steps
            err: Error of rampage

            e.g. (None, None), (pdf_captcha_response, SciHubError.BLOCKED_BY_CAPTCHA), ...

        """

        if rampage_type == SciHubRampageType.INPUT:
            # Query is user input

            self.log('<hr/>')
            self.log(self.tr('Dealing with query: ') + query, 'INFO')

            # Fetch PDF URL
            pdf_url, err = self.fetch_pdf_url(query)
            if err is not None:
                return None, err

            # Fetch PDF
            pdf, err = self.fetch_pdf(pdf_url)
            if err == SciHubError.BLOCKED_BY_CAPTCHA:
                return pdf, err
            elif err is not None:
                return None, err

            # Save PDF
            filename = urlparse(pdf_url).path[1:].split('/')[-1]
            self.save_pdf(pdf, filename)
        elif rampage_type == SciHubRampageType.PDF_CAPTCHA_RESPONSE:
            # Query is PDF captcha response (with answer)

            # Fetch PDF with Captcha
            pdf, err = self.fetch_pdf_with_captcha(query)
            if err == SciHubError.WRONG_CAPTCHA:
                self.log(
                    self.tr('Wrong captcha, failed to kill Angel [CAPTCHA]!'),
                    'ERROR')
                return None, err

            # Save PDF
            filename = urlparse(query.url).path[1:].split('/')[-1]
            self.save_pdf(pdf, filename)

        return None, None

    def run(self):
        """Thread entry point: run the rampage and report the result."""
        res, err = self.rampage(self._query, self._rampage_type)

        # BUG FIX: ``callback`` defaults to None in __init__; guard before
        # calling so a callback-less run does not raise TypeError.
        if self._callback:
            self._callback(res, err)